From 009e972f79264ec4330f1da38644f07003e6014f Mon Sep 17 00:00:00 2001 From: sangchengmeng Date: Wed, 7 May 2025 17:34:28 +0800 Subject: [PATCH 01/14] add_lightllm_kernel --- .../multimodal_model_quickstart.rst | 11 + lightllm-kernel/CMakeLists.txt | 65 ++++ lightllm-kernel/Makefile | 9 + lightllm-kernel/README-CH.md | 42 +++ lightllm-kernel/README.md | 39 +++ lightllm-kernel/csrc/cuda_compat.h | 49 +++ lightllm-kernel/csrc/moe/grouped_topk.cu | 319 ++++++++++++++++++ .../csrc/moe/grouped_topk_interface.cpp | 48 +++ lightllm-kernel/csrc/ops_bindings.cpp | 56 +++ lightllm-kernel/include/ops_common.h | 4 + lightllm-kernel/lightllm_kernel/__init__.py | 3 + .../lightllm_kernel/ops/__init__.py | 44 +++ .../lightllm_kernel/ops/attention.py | 0 lightllm-kernel/lightllm_kernel/ops/moe.py | 0 lightllm-kernel/lightllm_kernel/ops/quant.py | 0 lightllm-kernel/setup.py | 40 +++ 16 files changed, 729 insertions(+) create mode 100644 docs/CN/source/getting_started/multimodal_model_quickstart.rst create mode 100644 lightllm-kernel/CMakeLists.txt create mode 100644 lightllm-kernel/Makefile create mode 100644 lightllm-kernel/README-CH.md create mode 100644 lightllm-kernel/README.md create mode 100644 lightllm-kernel/csrc/cuda_compat.h create mode 100644 lightllm-kernel/csrc/moe/grouped_topk.cu create mode 100644 lightllm-kernel/csrc/moe/grouped_topk_interface.cpp create mode 100644 lightllm-kernel/csrc/ops_bindings.cpp create mode 100644 lightllm-kernel/include/ops_common.h create mode 100644 lightllm-kernel/lightllm_kernel/__init__.py create mode 100644 lightllm-kernel/lightllm_kernel/ops/__init__.py create mode 100644 lightllm-kernel/lightllm_kernel/ops/attention.py create mode 100644 lightllm-kernel/lightllm_kernel/ops/moe.py create mode 100644 lightllm-kernel/lightllm_kernel/ops/quant.py create mode 100644 lightllm-kernel/setup.py diff --git a/docs/CN/source/getting_started/multimodal_model_quickstart.rst b/docs/CN/source/getting_started/multimodal_model_quickstart.rst new file mode 100644 index 000000000..cc3eaf724 --- /dev/null +++ b/docs/CN/source/getting_started/multimodal_model_quickstart.rst @@ -0,0 +1,11 @@ +..multimodal_model_quickstart.rst +------------------------- + +下载多模态模型(如llava系列、internvl系列、qwen_vl系列等)的模型以后,在终端使用下面的代码部署API服务: + +.. code-block:: console + + $ python -m lightllm.server.api_server --model_dir ~/models/llava-7b-chat --use_dynamic_prompt_cache --enable_multimodal + +.. note:: + 上面代码中的 ``--model_dir`` 参数需要修改为你本机实际的模型路径。 diff --git a/lightllm-kernel/CMakeLists.txt b/lightllm-kernel/CMakeLists.txt new file mode 100644 index 000000000..c61ed9dd8 --- /dev/null +++ b/lightllm-kernel/CMakeLists.txt @@ -0,0 +1,65 @@ +cmake_minimum_required(VERSION 3.22) +project(lightllm_kernel LANGUAGES CXX CUDA) + +# GPU 架构:缺省支持 A100(80)、Ampere(86)、Ada/L40s/4090(89)、Hopper(90), +if(NOT CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 80;86;89;90-virtual) +endif() + +# 找 PyTorch & Python +find_package(Torch REQUIRED) +find_package(Python REQUIRED COMPONENTS Development) + +# 收集 csrc 下的 .cpp/.cu +file(GLOB_RECURSE SRC_CPP CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/csrc/*.cpp") +file(GLOB_RECURSE SRC_CUDA CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/csrc/*.cu") + +# 编译生成 Python 扩展, _C.so +add_library(_C SHARED ${SRC_CPP} ${SRC_CUDA}) + +# C++17 更方便调度宏 +target_compile_features(_C PRIVATE cxx_std_17) +target_include_directories(_C PRIVATE ${TORCH_INCLUDE_DIRS}) +target_link_libraries(_C + PRIVATE + ${TORCH_LIBRARIES} + Python::Python) + + +# 输出文件名 _C.so,无前缀 +set_target_properties(_C PROPERTIES + PREFIX "" + OUTPUT_NAME "_C" + BUILD_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib" + INSTALL_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib" +) + +# 安装:把 _C.so、Python 包和 csrc 一起拷到 site-packages +include(GNUInstallDirs) + +# 1) 计算 Python site-packages 路径 +execute_process( + COMMAND ${Python_EXECUTABLE} - < A100, 8.6-> A10, 8.9-> L40s/4090, 9.0+PTX-> Hopper + TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0+PTX" \ + python -m pip install -v . + +clean: + rm -rf build dist *.egg-info \ No newline at end of file diff --git a/lightllm-kernel/README-CH.md b/lightllm-kernel/README-CH.md new file mode 100644 index 000000000..647a594b8 --- /dev/null +++ b/lightllm-kernel/README-CH.md @@ -0,0 +1,42 @@ +# LightLLM-Kernel + +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +lightllm-kernel 是大模型推理系统 LightLLM 的 CUDA 算子库。它提供了在大型模型推理过程中所需的一系列自定义 GPU 运算算子,以加速关键步骤的计算。 + +## 功能列表 + +| Module | Description | +|--------------|-------------------------------------------------------------------------------------------------| +| **Attention** | Optimized Multi-Head Attention kernels with fused QKV operations and efficient softmax | +| **MoE** | Expert routing and computation kernels for Mixture-of-Experts architectures | +| **Quant** | Low-precision quantization support (INT8/INT4) for weights and activations | +| **Extensions**| Continuous expansion of optimized operations for emerging model architectures | + +## 安装方法 + +lightllm_kernel 提供了静态编译以及JIT(Just-In-Time)动态编译的安装方式。推荐使用静态编译安装以获得最佳性能,同时也支持开发者使用可编辑安装进行开发调试。 + +### System Requirements +- NVIDIA GPU with Compute Capability ≥ 7.0 (Volta+) +- CUDA 11.8 or higher +- Python 3.8+ + +### Installation Methods + +#### Static Compilation (Recommended) +```bash +git clone https://github.com/YourUsername/lightllm_kernel.git +cd lightllm_kernel +make build +# Alternative using pip +pip install . +``` + +## 贡献指南 +欢迎社区开发者为 lightllm_kernel 做出贡献!如果您计划新增自定义算子或改进现有功能,请参考以下指南: +- 新增算子实现:在 csrc/ 目录下添加您的 CUDA/C++ 源码文件,添加时建议参考现有算子的代码风格和结构。 +- 注册Python接口:在 csrc/ops_bindings.cpp中,将新增的算子通过 PyBind11 或 TORCH_LIBRARY 等机制注册到 Python 接口。 +- 导出算子到Python模块:在lightllm_kernel/ops/__init__.py只添加相应的导出代码,使新算子包含在 lightllm_kernel.ops 模块中。 +- 本地测试:开发完成后,请在本地对您的更改进行测试。您可以编译安装新的版本并编写简单的脚本调用新算子,检查其功能和性能是否符合预期。如果项目附带了测试用例,也请运行所有测试确保不引入回归。 +- \ No newline at end of file diff --git a/lightllm-kernel/README.md b/lightllm-kernel/README.md new file mode 100644 index 000000000..9ce4bce41 --- /dev/null +++ b/lightllm-kernel/README.md @@ -0,0 +1,39 @@ +# LightLLM-Kernel + +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +LightLLM-Kernel is a high-performance CUDA kernel library powering the LightLLM inference system. It provides optimized GPU implementations for critical operations in large language model (LLM) inference, delivering significant performance improvements through carefully crafted CUDA kernels. + +## Project Overview + +LightLLM-Kernel serves as the computational backbone for LightLLM framework, offering: +- **Custom CUDA Kernels**: Highly optimized implementations for transformer-based model operations +- **Memory Efficiency**: Reduced memory footprint through advanced quantization techniques +- **Scalability**: Support for large model architectures including MoE (Mixture-of-Experts) models + +## Key Features + +### Core Modules +| Module | Description | +|--------------|-------------------------------------------------------------------------------------------------| +| **Attention** | Optimized Multi-Head Attention kernels with fused QKV operations and efficient softmax | +| **MoE** | Expert routing and computation kernels for Mixture-of-Experts architectures | +| **Quant** | Low-precision quantization support (INT8/INT4) for weights and activations | +| **Extensions**| Continuous expansion of optimized operations for emerging model architectures | + +## Installation + +### System Requirements +- NVIDIA GPU with Compute Capability ≥ 7.0 (Volta+) +- CUDA 11.8 or higher +- Python 3.8+ + +### Installation Methods + +#### Static Compilation (Recommended) +```bash +git clone https://github.com/YourUsername/lightllm_kernel.git +cd lightllm_kernel +make build +# Alternative using pip +pip install . \ No newline at end of file diff --git a/lightllm-kernel/csrc/cuda_compat.h b/lightllm-kernel/csrc/cuda_compat.h new file mode 100644 index 000000000..82e55613d --- /dev/null +++ b/lightllm-kernel/csrc/cuda_compat.h @@ -0,0 +1,49 @@ +#pragma once + +#ifdef USE_ROCM + #include +#endif + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#ifndef USE_ROCM + #define VLLM_LDG(arg) __ldg(arg) +#else + #define VLLM_LDG(arg) *(arg) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \ + __shfl_xor_sync(uint32_t(-1), var, lane_mask) + #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ + __shfl_xor_sync(uint32_t(-1), var, lane_mask, width) +#else + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) + #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ + __shfl_xor(var, lane_mask, width) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) +#else + #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \ + __shfl_down_sync(uint32_t(-1), var, lane_delta) +#else + #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta) +#endif + +#ifndef USE_ROCM + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) +#else + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) +#endif diff --git a/lightllm-kernel/csrc/moe/grouped_topk.cu b/lightllm-kernel/csrc/moe/grouped_topk.cu new file mode 100644 index 000000000..635ca5193 --- /dev/null +++ b/lightllm-kernel/csrc/moe/grouped_topk.cu @@ -0,0 +1,319 @@ +#include +#include +#include +#include +#include +#include "../cuda_compat.h" + +#ifndef USE_ROCM + #include + #include +#else + #include + #include +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +namespace moe { + +template +__launch_bounds__(TPB) +__global__ void moeGroupedTopK( + const float* input, + const bool* finished, + float* inputs_after_softmax, + const int num_cols, + const float* correction_bias, + float* group_scores, + float* output, // topk_weights + int* indices, // topk_indices + int* group_indices, // token_expert_indices + const int num_experts, + const int num_expert_group, + const int topk_group, + const int k, + const bool renormalize, + const bool softmax_or_sigmoid, + const int start_expert, + const int end_expert) +{ + + const int thread_row_offset = blockIdx.x * num_cols; + + if(softmax_or_sigmoid) + { + //softmax + using BlockReduce_topk = cub::BlockReduce; + __shared__ typename BlockReduce_topk::TempStorage tmpStorage; + + __shared__ float normalizing_factor; + __shared__ float float_max; + + cub::Sum sum; + float threadData(-FLT_MAX); + + // Don't touch finished rows. + if ((finished != nullptr) && finished[blockIdx.x]) + { + return; + } + + for (int i = threadIdx.x; i < num_cols; i += TPB) + { + const int idx = thread_row_offset + i; + threadData = max(static_cast(input[idx]), threadData); + } + + const float maxElem = BlockReduce_topk(tmpStorage).Reduce(threadData, cub::Max()); + if (threadIdx.x == 0) + { + float_max = maxElem; + } + __syncthreads(); + + threadData = 0; + + for (int ii = threadIdx.x; ii < num_cols; ii += TPB) + { + const int idx = thread_row_offset + ii; + threadData += exp((static_cast(input[idx]) - float_max)); + } + + const auto Z = BlockReduce_topk(tmpStorage).Reduce(threadData, sum); + + if (threadIdx.x == 0) + { + normalizing_factor = 1.f / Z; + } + __syncthreads(); + + for (int ii = threadIdx.x; ii < num_cols; ii += TPB) + { + const int idx = thread_row_offset + ii; + const float val = exp((static_cast(input[idx]) - float_max)) * normalizing_factor; + inputs_after_softmax[idx] = val + (correction_bias ? correction_bias[idx] : 0.f); + } + } else { + // sigmoid + for (int i = threadIdx.x; i < num_cols; i += TPB) + { + const int idx = thread_row_offset + i; + float val = 1.f / (1.f + expf(-input[idx])); + inputs_after_softmax[idx] = val + (correction_bias ? correction_bias[idx] : 0.f); + } + } + __syncthreads(); + + using cub_kvp = cub::KeyValuePair; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmpStorage_kvp; + + int block_row = blockIdx.x; // (0 - tokens-1) 即0-199 + int thread_read_offset = block_row * num_experts; + + int group_size = num_experts / num_expert_group; + + for(int group_id = threadIdx.x; group_id < num_expert_group; group_id += TPB) + { + float local_max = -FLT_MAX; + const int start = group_id * group_size; + const int end = (group_id + 1) * group_size; + + // find max in this group + for(int e = start; e < end; e++) + { + float val = inputs_after_softmax[thread_read_offset + e]; + local_max = fmaxf(local_max, val); + } + + // store max in group_scores + group_scores[block_row * num_expert_group + group_id] = local_max; + } + __syncthreads(); + + cub_kvp thread_kvp; + cub::ArgMax arg_max; + + const bool row_is_active = finished ? !finished[block_row] : true; + thread_read_offset = blockIdx.x * num_expert_group; + + for (int k_idx = 0; k_idx < topk_group; ++k_idx) + { + thread_kvp.key = 0; + thread_kvp.value = -1.f; // This is OK because inputs are probabilities + + // every thread finds the max expert in a different expert group + cub_kvp inp_kvp; + for (int expert = threadIdx.x; expert < num_expert_group; expert += TPB) + { + const int idx = thread_read_offset + expert; + inp_kvp.key = expert; + inp_kvp.value = group_scores[idx]; + + for (int prior_k = 0; prior_k < k_idx; ++prior_k) + { + const int prior_winning_expert = group_indices[topk_group * block_row + prior_k]; + + if (prior_winning_expert == expert) + { + inp_kvp = thread_kvp; + } + } + + thread_kvp = arg_max(inp_kvp, thread_kvp); + } + + const cub_kvp result_kvp = BlockReduce(tmpStorage_kvp).Reduce(thread_kvp, arg_max); + if (threadIdx.x == 0) + { + // Ignore experts the node isn't responsible for with expert parallelism + const int expert = result_kvp.key; + const bool node_uses_expert = expert >= start_expert && expert < end_expert; + const bool should_process_row = row_is_active && node_uses_expert; + + const int idx = topk_group * block_row + k_idx; + group_indices[idx] = should_process_row ? (expert - start_expert) : num_expert_group; + assert(group_indices[idx] >= 0); + } + __syncthreads(); + } + + int score_offset = block_row * num_experts; + for (int e = threadIdx.x; e < num_experts; e += TPB) + { + int grp = e / group_size; + bool selected = false; + // selected = True if e in group_indices[block_row, :] + for (int i = 0; i < topk_group; i++) { + int sel_grp = group_indices[block_row * topk_group + i]; + if (sel_grp == grp) { + selected = true; + break; + } + } + if (!selected) { + inputs_after_softmax[score_offset + e] = 0.0f; + } + } + __syncthreads(); + + for (int tk = 0; tk < k; tk++) { + thread_kvp.key = -1; + thread_kvp.value = -FLT_MAX; + for (int e = threadIdx.x; e < num_experts; e += TPB) { + bool already_selected = false; + for (int prev = 0; prev < tk; prev++) { + if (indices[block_row * k + prev] == e) { + already_selected = true; + break; + } + } + float val = already_selected ? -FLT_MAX : inputs_after_softmax[score_offset + e]; + cub_kvp inp; + inp.key = e; + inp.value = val; + thread_kvp = arg_max(inp, thread_kvp); + } + cub_kvp result = BlockReduce(tmpStorage_kvp).Reduce(thread_kvp, arg_max); + if (threadIdx.x == 0) { + output[block_row * k + tk] = result.value; + indices[block_row * k + tk] = result.key; + } + __syncthreads(); + } + + // renormalize + if (threadIdx.x == 0 && renormalize) { + float sum = 0.0f; + int out_offset = block_row * k; + for (int j = 0; j < k; j++) { + sum += output[out_offset + j]; + } + // avoid division by zero + if (sum > 0.0f) { + for (int j = 0; j < k; j++) { + output[out_offset + j] /= sum; + } + } + } + __syncthreads(); + +} + +void GroupedTopKKernelLauncher( + const float* gating_output, + const float* correction_bias, + float* topk_weights, + int* topk_indicies, + int* group_indices, + float* softmax_workspace, + float* group_scores, + const int num_tokens, + const int num_experts, + const int num_expert_group, + const int topk_group, + const int topk, + const bool renormalize, + const bool softmax_or_sigmoid, + cudaStream_t stream) { + + static constexpr int TPB = 256; + moeGroupedTopK<<>>( + gating_output, nullptr, softmax_workspace, num_experts, correction_bias, + group_scores, topk_weights, topk_indicies, group_indices, + num_experts, num_expert_group, topk_group, topk, renormalize, softmax_or_sigmoid, 0, num_experts); +} + +} // namespace moe + +void grouped_topk_cuda( + torch::Tensor& topk_weights, // [num_tokens, topk] + torch::Tensor& correction_bias, // [num_tokens, num_experts] + torch::Tensor& topk_indices, // [num_tokens, topk] + torch::Tensor& group_indices, // [num_tokens, topk_group] + torch::Tensor& gating_output, // [num_tokens, num_experts] + const int num_expert_group, + const int topk_group, + const int topk, + const bool renormalize, + std::string scoring_func, + torch::Tensor group_scores = torch::Tensor() // [num_tokens, num_expert_group] + ) +{ + const int num_experts = gating_output.size(-1); + const int num_tokens = gating_output.numel() / num_experts; + + const int64_t workspace_size = num_tokens * num_experts; + + const bool softmax_or_sigmoid = (scoring_func == "softmax") ? true : false; + + float* d_group_scores = nullptr; + if (group_scores.defined() && group_scores.numel() > 0) { + d_group_scores = group_scores.data_ptr(); + } else { + cudaMalloc(&d_group_scores, num_tokens * num_expert_group * sizeof(float)); + cudaMemset(d_group_scores, 0, num_tokens * num_expert_group * sizeof(float)); + } + + const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options()); + moe::GroupedTopKKernelLauncher( + gating_output.data_ptr(), + correction_bias.defined() ? correction_bias.data_ptr() : nullptr, + topk_weights.data_ptr(), + topk_indices.data_ptr(), + group_indices.data_ptr(), + softmax_workspace.data_ptr(), + d_group_scores, + num_tokens, + num_experts, + num_expert_group, + topk_group, + topk, + renormalize, + softmax_or_sigmoid, + stream); +} \ No newline at end of file diff --git a/lightllm-kernel/csrc/moe/grouped_topk_interface.cpp b/lightllm-kernel/csrc/moe/grouped_topk_interface.cpp new file mode 100644 index 000000000..f35c92caa --- /dev/null +++ b/lightllm-kernel/csrc/moe/grouped_topk_interface.cpp @@ -0,0 +1,48 @@ +#include +#include "../../include/ops_common.h" + + +void grouped_topk_cuda( + torch::Tensor& topk_weights, + torch::Tensor& correction_bias, + torch::Tensor& topk_indices, + torch::Tensor& group_indices, + torch::Tensor& gating_output, + int num_expert_group, + int topk_group, + int topk, + bool renormalize, + std::string scoring_func, + torch::Tensor group_scores = torch::Tensor()); + +torch::Tensor grouped_topk( + torch::Tensor topk_weights, + torch::Tensor correction_bias, + torch::Tensor topk_indices, + torch::Tensor group_indices, + torch::Tensor gating_output, + int num_expert_group, + int topk_group, + int topk, + bool renormalize, + std::string scoring_func, + torch::Tensor group_scores /* = {} */) { + + TORCH_CHECK(topk_weights.is_cuda(), "topk_weights must be CUDA tensor"); + TORCH_CHECK(gating_output.is_cuda(), "gating_output must be CUDA tensor"); + + grouped_topk(topk_weights, + correction_bias, + topk_indices, + group_indices, + gating_output, + num_expert_group, + topk_group, + topk, + renormalize, + scoring_func, + group_scores); + + // 就地写结果,所以这里直接返回topk_weights + return topk_weights; +} diff --git a/lightllm-kernel/csrc/ops_bindings.cpp b/lightllm-kernel/csrc/ops_bindings.cpp new file mode 100644 index 000000000..e21a9d376 --- /dev/null +++ b/lightllm-kernel/csrc/ops_bindings.cpp @@ -0,0 +1,56 @@ +#include +#include "../include/ops_common.h" +#include + +void grouped_topk_cuda( + torch::Tensor& topk_weights, + torch::Tensor& correction_bias, + torch::Tensor& topk_indices, + torch::Tensor& group_indices, + torch::Tensor& gating_output, + int num_expert_group, + int topk_group, + int topk, + bool renormalize, + std::string scoring_func, + torch::Tensor group_scores); + + +torch::Tensor grouped_topk( + torch::Tensor topk_weights, + torch::Tensor correction_bias, + torch::Tensor topk_indices, + torch::Tensor group_indices, + torch::Tensor gating_output, + int64_t num_expert_group, + int64_t topk_group, + int64_t topk, + bool renormalize, + std::string scoring_func, + torch::Tensor group_scores) { + + grouped_topk_cuda(topk_weights, correction_bias, topk_indices, group_indices, + gating_output, + static_cast(num_expert_group), + static_cast(topk_group), + static_cast(topk), + renormalize, scoring_func, group_scores); + + return topk_weights; +} + +PYBIND11_MODULE(_C, m) { + m.def("grouped_topk", &grouped_topk, + "Grouped Top-K routing (CUDA)", + py::arg("topk_weights"), + py::arg("correction_bias"), + py::arg("topk_indices"), + py::arg("group_indices"), + py::arg("gating_output"), + py::arg("num_expert_group"), + py::arg("topk_group"), + py::arg("topk"), + py::arg("renormalize"), + py::arg("scoring_func"), + py::arg("group_scores") = torch::Tensor()); +} \ No newline at end of file diff --git a/lightllm-kernel/include/ops_common.h b/lightllm-kernel/include/ops_common.h new file mode 100644 index 000000000..3c80fef44 --- /dev/null +++ b/lightllm-kernel/include/ops_common.h @@ -0,0 +1,4 @@ +#pragma once +#include +#include +#include \ No newline at end of file diff --git a/lightllm-kernel/lightllm_kernel/__init__.py b/lightllm-kernel/lightllm_kernel/__init__.py new file mode 100644 index 000000000..23c3bd2b0 --- /dev/null +++ b/lightllm-kernel/lightllm_kernel/__init__.py @@ -0,0 +1,3 @@ +from . import ops # noqa: F401 + +__all__ = ["ops"] diff --git a/lightllm-kernel/lightllm_kernel/ops/__init__.py b/lightllm-kernel/lightllm_kernel/ops/__init__.py new file mode 100644 index 000000000..c3f54642b --- /dev/null +++ b/lightllm-kernel/lightllm_kernel/ops/__init__.py @@ -0,0 +1,44 @@ +import importlib +import os +from pathlib import Path +from torch.utils.cpp_extension import load + +PKG = "lightllm_kernel" +try: + _C = importlib.import_module(f"{PKG}._C") +except ImportError: + repo_root = Path(__file__).resolve().parents[2] + csrc_dir = repo_root / "csrc" + if not csrc_dir.exists(): + raise ImportError( + "Cannot import compiled extension 'lightllm_kernel.ops' and no source " + "directory (csrc/) found; please ensure you have run " + "'cmake --install' or placed lightllm_kernel.ops.so on PYTHONPATH." + ) + + sources = ( + [str(p) for p in (csrc_dir / "moe").glob("*.cpp")] + + [str(p) for p in (csrc_dir / "moe").glob("*.cu")] + + [str(csrc_dir / "ops_bindings.cpp")] + ) + + _C = load( + name="lightllm_kernel._C", + sources=sources, + verbose=True, + extra_cuda_cflags=[ + # A100 + "-gencode=arch=compute_80,code=sm_80", + "-gencode=arch=compute_80,code=compute_80", + # Ada / L40s / 4090 + "-gencode=arch=compute_89,code=sm_89", + "-gencode=arch=compute_89,code=compute_89", + # Hopper / H100 / H200 + "-gencode=arch=compute_90,code=sm_90", + "-gencode=arch=compute_90,code=compute_90", + ], + ) + +# 向外暴露 Python 端接口 +grouped_topk = _C.grouped_topk +__all__ = ["grouped_topk"] diff --git a/lightllm-kernel/lightllm_kernel/ops/attention.py b/lightllm-kernel/lightllm_kernel/ops/attention.py new file mode 100644 index 000000000..e69de29bb diff --git a/lightllm-kernel/lightllm_kernel/ops/moe.py b/lightllm-kernel/lightllm_kernel/ops/moe.py new file mode 100644 index 000000000..e69de29bb diff --git a/lightllm-kernel/lightllm_kernel/ops/quant.py b/lightllm-kernel/lightllm_kernel/ops/quant.py new file mode 100644 index 000000000..e69de29bb diff --git a/lightllm-kernel/setup.py b/lightllm-kernel/setup.py new file mode 100644 index 000000000..34f992b73 --- /dev/null +++ b/lightllm-kernel/setup.py @@ -0,0 +1,40 @@ +from pathlib import Path +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +this_dir = Path(__file__).parent + +sources = [ + str(this_dir / "csrc" / "moe" / "grouped_topk_interface.cpp"), + str(this_dir / "csrc" / "moe" / "grouped_topk.cu"), + str(this_dir / "csrc" / "ops_bindings.cpp"), +] +print("---- sources for CUDAExtension ----") +for s in sources: + print(s) +print("-----------------------------------") +ext_modules = [ + CUDAExtension( + name="lightllm_kernel._C", + sources=sources, + extra_compile_args={ + "cxx": ["-O3"], + "nvcc": [ + "-O3", + "--use_fast_math", + "-gencode=arch=compute_90,code=sm_90", + "-gencode=arch=compute_90,code=compute_90", + ], + }, + include_dirs=[str(this_dir / "include")], + ) +] + +setup( + name="lightllm_kernel", + packages=["lightllm_kernel", "lightllm_kernel.ops"], + version="0.1", + ext_modules=ext_modules, + cmdclass={"build_ext": BuildExtension}, + package_dir={"ops": "ops"}, +) From 8b5f18b0f0b26c734d85e3e6cca9bcd1ea387a9f Mon Sep 17 00:00:00 2001 From: Xtra <571889291@qq.com> Date: Fri, 9 May 2025 12:49:06 +0800 Subject: [PATCH 02/14] feat(vit_cuda_kernels):add norm quant and some fused ops (#886) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # vit fp8w8a8量化推理相关算子优化 ## 新增算子 1. rmsnorm_bf16,性能较pytorch较大提升 2. pre_tp_norm,融合了tp_norm的通信前操作 3. post_tp_norm,融合了tp_norm的通信后操作 4. pre_token_quant,逐token FP8量化,性能较vllm的quant极大提升,较sgl的quant性能更好 5. gelu_per_token_quant,融合了GELU激活 + 逐token FP8量化 6. add_norm_quant,融合了attention与mlp模块间的,add norm quant操作 7. cutlass_scaled_mm_bias_ls,融合了量化矩阵乘、反量化和可选的bias和ls weight --- lightllm-kernel/csrc/fusion/add_norm_quant.cu | 551 + .../csrc/fusion/gelu_per_token_quant.cu | 367 + lightllm-kernel/csrc/fusion/post_tp_norm.cu | 364 + lightllm-kernel/csrc/fusion/pre_tp_norm.cu | 257 + lightllm-kernel/csrc/gemm/Epilogues.md | 147 + lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu | 73 + lightllm-kernel/csrc/gemm/scaled_mm_c3x.cuh | 161 + .../gemm/scaled_mm_c3x_sm90_fp8_dispatch.cuh | 97 + lightllm-kernel/csrc/gemm/scaled_mm_entry.cu | 83 + lightllm-kernel/csrc/moe/grouped_topk.cu | 35 +- .../csrc/moe/grouped_topk_interface.cpp | 48 - lightllm-kernel/csrc/norm/rmsnorm_bf16.cu | 350 + lightllm-kernel/csrc/ops_bindings.cpp | 52 +- .../csrc/quant/per_token_quantize_bf16.cu | 342 + .../cutlass/include/cute/algorithm/axpby.hpp | 95 + .../cutlass/include/cute/algorithm/clear.hpp | 64 + .../cute/algorithm/cooperative_copy.hpp | 333 + .../cute/algorithm/cooperative_gemm.hpp | 512 + .../cutlass/include/cute/algorithm/copy.hpp | 382 + .../cutlass/include/cute/algorithm/fill.hpp | 87 + .../include/cute/algorithm/functional.hpp | 290 + .../cutlass/include/cute/algorithm/gemm.hpp | 500 + .../cutlass/include/cute/algorithm/prefer.hpp | 46 + .../include/cute/algorithm/prefetch.hpp | 145 + .../cute/algorithm/tensor_algorithms.hpp | 166 + .../cute/algorithm/tuple_algorithms.hpp | 1073 + .../include/cute/arch/cluster_sm90.hpp | 245 + .../cutlass/include/cute/arch/config.hpp | 50 + .../cutlass/include/cute/arch/copy.hpp | 107 + .../cutlass/include/cute/arch/copy_sm50.hpp | 98 + .../cutlass/include/cute/arch/copy_sm75.hpp | 236 + .../cutlass/include/cute/arch/copy_sm80.hpp | 198 + .../cutlass/include/cute/arch/copy_sm90.hpp | 219 + .../include/cute/arch/copy_sm90_desc.hpp | 440 + .../include/cute/arch/copy_sm90_tma.hpp | 1395 + .../cutlass/include/cute/arch/mma.hpp | 64 + .../cutlass/include/cute/arch/mma_sm61.hpp | 87 + .../cutlass/include/cute/arch/mma_sm70.hpp | 329 + .../cutlass/include/cute/arch/mma_sm75.hpp | 120 + .../cutlass/include/cute/arch/mma_sm80.hpp | 2243 + .../cutlass/include/cute/arch/mma_sm90.hpp | 9331 +++ .../include/cute/arch/mma_sm90_desc.hpp | 156 + .../include/cute/arch/mma_sm90_gmma.hpp | 20974 ++++++ .../include/cute/arch/mma_sm90_gmma_ext.hpp | 56445 +++++++++++++++ .../cute/arch/mma_sm90_gmma_sparse.hpp | 22743 ++++++ .../cute/arch/mma_sm90_gmma_sparse_ext.hpp | 60445 ++++++++++++++++ .../cutlass/include/cute/arch/util.hpp | 320 + .../cutlass/include/cute/atom/copy_atom.hpp | 764 + .../cutlass/include/cute/atom/copy_traits.hpp | 159 + .../include/cute/atom/copy_traits_sm50.hpp | 75 + .../include/cute/atom/copy_traits_sm75.hpp | 143 + .../include/cute/atom/copy_traits_sm80.hpp | 194 + .../include/cute/atom/copy_traits_sm90.hpp | 132 + .../cute/atom/copy_traits_sm90_im2col.hpp | 940 + .../cute/atom/copy_traits_sm90_tma.hpp | 1525 + .../atom/copy_traits_sm90_tma_swizzle.hpp | 93 + .../cutlass/include/cute/atom/mma_atom.hpp | 1117 + .../cutlass/include/cute/atom/mma_traits.hpp | 189 + .../include/cute/atom/mma_traits_sm61.hpp | 73 + .../include/cute/atom/mma_traits_sm70.hpp | 198 + .../include/cute/atom/mma_traits_sm75.hpp | 81 + .../include/cute/atom/mma_traits_sm80.hpp | 489 + .../include/cute/atom/mma_traits_sm90.hpp | 144 + .../cute/atom/mma_traits_sm90_gmma.hpp | 8999 +++ .../cute/atom/mma_traits_sm90_gmma_ext.hpp | 20116 +++++ .../cute/atom/mma_traits_sm90_gmma_sparse.hpp | 7738 ++ .../atom/mma_traits_sm90_gmma_sparse_ext.hpp | 17335 +++++ .../cutlass/include/cute/config.hpp | 149 + .../include/cute/container/alignment.hpp | 70 + .../cutlass/include/cute/container/array.hpp | 492 + .../include/cute/container/array_aligned.hpp | 42 + .../include/cute/container/array_subbyte.hpp | 643 + .../include/cute/container/bit_field.hpp | 133 + .../include/cute/container/cuda_types.hpp | 183 + .../include/cute/container/packed_tuple.hpp | 254 + .../cutlass/include/cute/container/tuple.hpp | 744 + .../include/cute/container/type_list.hpp | 124 + .../cutlass/include/cute/int_tuple.hpp | 864 + .../cutlass/include/cute/layout.hpp | 2058 + .../cutlass/include/cute/layout_composed.hpp | 652 + .../include/cute/numeric/arithmetic_tuple.hpp | 556 + .../cutlass/include/cute/numeric/complex.hpp | 76 + .../cutlass/include/cute/numeric/int.hpp | 106 + .../include/cute/numeric/integer_sequence.hpp | 151 + .../cute/numeric/integral_constant.hpp | 517 + .../include/cute/numeric/integral_ratio.hpp | 264 + .../cutlass/include/cute/numeric/math.hpp | 356 + .../include/cute/numeric/numeric_types.hpp | 135 + .../cutlass/include/cute/numeric/real.hpp | 74 + .../cutlass/include/cute/pointer.hpp | 322 + .../cutlass/include/cute/pointer_base.hpp | 246 + .../cutlass/include/cute/pointer_flagged.hpp | 199 + .../cutlass/include/cute/pointer_sparse.hpp | 172 + .../cutlass/include/cute/pointer_swizzle.hpp | 168 + .../cutlass/include/cute/stride.hpp | 598 + .../cutlass/include/cute/swizzle.hpp | 498 + .../cutlass/include/cute/swizzle_layout.hpp | 584 + .../cutlass/include/cute/tensor.hpp | 58 + .../cutlass/include/cute/tensor_impl.hpp | 1193 + .../cutlass/include/cute/tensor_predicate.hpp | 78 + .../cutlass/include/cute/tensor_zip.hpp | 243 + .../cutlass/include/cute/underscore.hpp | 194 + .../cutlass/include/cute/util/debug.hpp | 164 + .../cutlass/include/cute/util/print.hpp | 261 + .../cutlass/include/cute/util/type_traits.hpp | 292 + .../cutlass/include/cutlass/aligned_buffer.h | 129 + .../cutlass/include/cutlass/arch/arch.h | 109 + .../cutlass/include/cutlass/arch/barrier.h | 630 + .../include/cutlass/arch/cache_operation.h | 66 + .../cutlass/include/cutlass/arch/config.h | 81 + .../cutlass/arch/grid_dependency_control.h | 84 + .../cutlass/include/cutlass/arch/memory.h | 602 + .../include/cutlass/arch/memory_sm75.h | 269 + .../include/cutlass/arch/memory_sm80.h | 472 + .../cutlass/include/cutlass/arch/mma.h | 269 + .../cutlass/include/cutlass/arch/mma_sm50.h | 432 + .../cutlass/include/cutlass/arch/mma_sm60.h | 252 + .../cutlass/include/cutlass/arch/mma_sm61.h | 142 + .../cutlass/include/cutlass/arch/mma_sm70.h | 665 + .../cutlass/include/cutlass/arch/mma_sm75.h | 793 + .../cutlass/include/cutlass/arch/mma_sm80.h | 1505 + .../cutlass/include/cutlass/arch/mma_sm89.h | 367 + .../cutlass/include/cutlass/arch/mma_sm90.h | 245 + .../include/cutlass/arch/mma_sparse_sm80.h | 1238 + .../include/cutlass/arch/mma_sparse_sm89.h | 409 + .../include/cutlass/arch/reg_reconfig.h | 67 + .../cutlass/include/cutlass/arch/simd.h | 125 + .../cutlass/include/cutlass/arch/simd_sm60.h | 104 + .../cutlass/include/cutlass/arch/simd_sm61.h | 147 + .../cutlass/include/cutlass/arch/synclog.hpp | 1324 + .../cutlass/include/cutlass/arch/wmma.h | 223 + .../cutlass/include/cutlass/arch/wmma_sm70.h | 136 + .../cutlass/include/cutlass/arch/wmma_sm72.h | 210 + .../cutlass/include/cutlass/arch/wmma_sm75.h | 207 + .../cutlass/include/cutlass/array.h | 2614 + .../include/cutlass/array_planar_complex.h | 89 + .../cutlass/include/cutlass/array_subbyte.h | 559 + .../cutlass/include/cutlass/barrier.h | 377 + .../cutlass/include/cutlass/bfloat16.h | 679 + .../cutlass/include/cutlass/blas3.h | 143 + .../cutlass/include/cutlass/blas3_types.h | 78 + .../cutlass/include/cutlass/block_striped.h | 267 + .../include/cutlass/cluster_launch.hpp | 275 + .../cutlass/include/cutlass/complex.h | 823 + .../cutlass/include/cutlass/constants.h | 1239 + .../conv/collective/builders/sm90_common.inl | 96 + .../collective/builders/sm90_gmma_builder.inl | 257 + .../conv/collective/collective_builder.hpp | 93 + .../conv/collective/collective_conv.hpp | 62 + .../cutlass/conv/collective/detail.hpp | 254 + ..._implicit_gemm_gmma_ss_warpspecialized.hpp | 663 + .../cutlass/conv/conv2d_problem_size.h | 654 + .../cutlass/conv/conv3d_problem_size.h | 513 + .../cutlass/conv/convnd_problem_shape.hpp | 561 + .../include/cutlass/conv/convolution.h | 194 + .../cutlass/include/cutlass/conv/detail.hpp | 137 + .../conv/device/conv_universal_adapter.hpp | 421 + .../cutlass/conv/device/direct_convolution.h | 270 + .../conv/device/implicit_gemm_convolution.h | 361 + .../device/implicit_gemm_convolution_fusion.h | 269 + .../include/cutlass/conv/dispatch_policy.hpp | 90 + .../cutlass/conv/kernel/conv_universal.hpp | 65 + .../cutlass/conv/kernel/default_conv2d.h | 322 + .../conv/kernel/default_conv2d_dgrad.h | 1927 + .../conv/kernel/default_conv2d_fprop.h | 2007 + .../conv/kernel/default_conv2d_fprop_fusion.h | 357 + .../kernel/default_conv2d_fprop_with_absmax.h | 127 + .../default_conv2d_fprop_with_broadcast.h | 221 + .../default_conv2d_fprop_with_reduction.h | 130 + .../conv/kernel/default_conv2d_group_fprop.h | 622 + .../conv/kernel/default_conv2d_wgrad.h | 1011 + .../conv/kernel/default_conv2d_wgrad_fusion.h | 325 + .../conv/kernel/default_conv3d_dgrad.h | 736 + .../conv/kernel/default_conv3d_fprop.h | 981 + .../conv/kernel/default_conv3d_fprop_fusion.h | 360 + .../default_conv3d_fprop_with_broadcast.h | 222 + .../conv/kernel/default_conv3d_wgrad.h | 936 + .../cutlass/conv/kernel/default_deconv2d.h | 999 + .../kernel/default_deconv2d_with_broadcast.h | 305 + .../cutlass/conv/kernel/default_deconv3d.h | 541 + .../kernel/default_deconv3d_with_broadcast.h | 309 + .../conv/kernel/default_depthwise_fprop.h | 588 + .../cutlass/conv/kernel/direct_convolution.h | 505 + .../conv/kernel/implicit_gemm_convolution.h | 455 + .../kernel/implicit_gemm_convolution_fusion.h | 461 + .../implicit_gemm_convolution_strided_dgrad.h | 492 + .../implicit_gemm_convolution_with_absmax.h | 494 + ...cit_gemm_convolution_with_fused_epilogue.h | 499 + ...sm90_implicit_gemm_tma_warpspecialized.hpp | 76 + .../cutlass/conv/thread/depthwise_mma.h | 325 + ...rad_filter_tile_access_iterator_analytic.h | 485 + ...ad_filter_tile_access_iterator_optimized.h | 619 + ...t_gradient_tile_access_iterator_analytic.h | 606 + ..._gradient_tile_access_iterator_optimized.h | 821 + ...activation_tile_access_iterator_analytic.h | 332 + ...vation_tile_access_iterator_few_channels.h | 360 + ...tion_tile_access_iterator_fixed_channels.h | 353 + ...ctivation_tile_access_iterator_optimized.h | 422 + ...rop_filter_tile_access_iterator_analytic.h | 330 + ...filter_tile_access_iterator_few_channels.h | 289 + ...lter_tile_access_iterator_fixed_channels.h | 275 + ...op_filter_tile_access_iterator_optimized.h | 322 + .../cutlass/conv/threadblock/conv2d_params.h | 893 + .../conv/threadblock/conv2d_tile_iterator.h | 337 + ...activation_tile_access_iterator_analytic.h | 285 + ...ctivation_tile_access_iterator_optimized.h | 321 + ...t_gradient_tile_access_iterator_analytic.h | 260 + ..._gradient_tile_access_iterator_optimized.h | 310 + ...rad_filter_tile_access_iterator_analytic.h | 268 + ...ad_filter_tile_access_iterator_optimized.h | 289 + ...t_gradient_tile_access_iterator_analytic.h | 343 + ..._gradient_tile_access_iterator_optimized.h | 489 + ...activation_tile_access_iterator_analytic.h | 291 + ...ctivation_tile_access_iterator_optimized.h | 478 + ...rop_filter_tile_access_iterator_analytic.h | 259 + ...op_filter_tile_access_iterator_optimized.h | 279 + .../cutlass/conv/threadblock/conv3d_params.h | 508 + ...activation_tile_access_iterator_analytic.h | 289 + ...ctivation_tile_access_iterator_optimized.h | 319 + ...t_gradient_tile_access_iterator_analytic.h | 267 + ..._gradient_tile_access_iterator_optimized.h | 310 + .../depthwise_direct_conv_params.h | 230 + ...erator_direct_conv_fixed_stride_dilation.h | 314 + ...le_access_iterator_direct_conv_optimized.h | 291 + .../depthwise_fprop_direct_conv_multistage.h | 551 + ...le_access_iterator_direct_conv_optimized.h | 261 + .../threadblock/depthwise_fprop_pipelined.h | 336 + .../conv/threadblock/depthwise_mma_base.h | 229 + ...depthwise_mma_core_with_lane_access_size.h | 952 + .../implicit_gemm_fprop_fusion_multistage.h | 802 + .../threadblock/implicit_gemm_multistage.h | 539 + .../threadblock/implicit_gemm_pipelined.h | 320 + .../implicit_gemm_wgrad_fusion_multistage.h | 729 + ...icated_scale_bias_vector_access_iterator.h | 470 + .../predicated_scale_bias_vector_iterator.h | 371 + .../conv/threadblock/threadblock_swizzle.h | 193 + .../cutlass/conv/warp/mma_depthwise_simt.h | 380 + .../warp/mma_depthwise_simt_tile_iterator.h | 862 + .../conv/warp/scale_bias_relu_transform.h | 221 + .../cutlass/include/cutlass/coord.h | 480 + .../cutlass/include/cutlass/core_io.h | 286 + .../include/cutlass/cuda_host_adapter.hpp | 407 + .../cutlass/include/cutlass/cutlass.h | 160 + .../include/cutlass/detail/collective.hpp | 63 + .../cutlass/detail/dependent_false.hpp | 86 + .../include/cutlass/detail/helper_macros.hpp | 205 + .../cutlass/include/cutlass/detail/layout.hpp | 406 + .../cutlass/include/cutlass/detail/mma.hpp | 71 + .../cutlass/include/cutlass/device_kernel.h | 125 + .../collective/builders/sm90_builder.inl | 812 + .../collective/builders/sm90_common.inl | 80 + .../collective/collective_builder.hpp | 120 + .../collective/collective_epilogue.hpp | 71 + .../epilogue/collective/default_epilogue.hpp | 242 + .../collective/default_epilogue_array.hpp | 273 + .../cutlass/epilogue/collective/detail.hpp | 491 + .../collective/epilogue_tensor_broadcast.hpp | 271 + .../collective/sm70_epilogue_vectorized.hpp | 549 + .../sm70_epilogue_vectorized_array.hpp | 412 + ...m90_epilogue_array_tma_warpspecialized.hpp | 1191 + .../sm90_epilogue_tma_warpspecialized.hpp | 904 + ...e_tma_warpspecialized_bias_elementwise.hpp | 164 + .../cutlass/epilogue/dispatch_policy.hpp | 195 + .../cutlass/epilogue/fusion/callbacks.hpp | 89 + .../cutlass/epilogue/fusion/operations.hpp | 351 + .../sm90_callbacks_tma_warpspecialized.hpp | 1787 + ...90_visitor_compute_tma_warpspecialized.hpp | 839 + .../sm90_visitor_load_tma_warpspecialized.hpp | 1415 + ...sm90_visitor_store_tma_warpspecialized.hpp | 1736 + .../sm90_visitor_tma_warpspecialized.hpp | 1139 + .../fusion/sm90_visitor_topk_softmax.hpp | 759 + .../cutlass/epilogue/thread/activation.h | 758 + .../cutlass/epilogue/thread/conversion_op.h | 132 + .../cutlass/epilogue/thread/detail.hpp | 52 + .../epilogue/thread/linear_combination.h | 523 + .../linear_combination_bias_elementwise.h | 524 + .../thread/linear_combination_bias_relu.h | 610 + .../thread/linear_combination_clamp.h | 685 + .../thread/linear_combination_dgelu.h | 250 + .../thread/linear_combination_drelu.h | 452 + .../epilogue/thread/linear_combination_gelu.h | 70 + .../thread/linear_combination_generic.h | 265 + .../linear_combination_generic_with_scaling.h | 325 + .../thread/linear_combination_hardswish.h | 69 + .../thread/linear_combination_leaky_relu.h | 231 + .../thread/linear_combination_params.h | 75 + .../linear_combination_planar_complex.h | 236 + .../epilogue/thread/linear_combination_relu.h | 572 + .../thread/linear_combination_relu0.h | 543 + .../linear_combination_residual_block.h | 301 + .../thread/linear_combination_sigmoid.h | 70 + .../epilogue/thread/linear_combination_silu.h | 69 + .../linear_combination_tensor_broadcast.hpp | 253 + .../linear_combination_with_elementwise.h | 234 + .../cutlass/epilogue/thread/reduction_op.h | 97 + .../cutlass/epilogue/thread/scale_type.h | 66 + .../default_epilogue_complex_tensor_op.h | 255 + ...default_epilogue_complex_tensor_op_blas3.h | 264 + .../default_epilogue_direct_store.h | 74 + .../default_epilogue_planar_complex.h | 241 + .../threadblock/default_epilogue_simt.h | 443 + .../threadblock/default_epilogue_tensor_op.h | 904 + .../default_epilogue_tensor_op_blas3.h | 175 + .../default_epilogue_volta_tensor_op.h | 337 + .../default_epilogue_with_absmax.h | 126 + .../default_epilogue_with_broadcast.h | 376 + .../default_epilogue_with_reduction.h | 177 + .../default_epilogue_wmma_tensor_op.h | 165 + .../threadblock/default_thread_map_simt.h | 127 + .../default_thread_map_tensor_op.h | 208 + .../default_thread_map_volta_tensor_op.h | 228 + .../default_thread_map_wmma_tensor_op.h | 113 + .../direct_store_epilogue_iterator.h | 142 + .../cutlass/epilogue/threadblock/epilogue.h | 543 + .../epilogue/threadblock/epilogue_base.h | 240 + .../threadblock/epilogue_base_streamk.h | 197 + .../epilogue/threadblock/epilogue_depthwise.h | 335 + .../threadblock/epilogue_direct_store.h | 347 + .../threadblock/epilogue_gemm_k_reduction.h | 212 + .../threadblock/epilogue_planar_complex.h | 401 + .../threadblock/epilogue_smem_accumulator.h | 230 + .../epilogue_streamk_with_broadcast.h | 443 + .../epilogue_visitor_with_softmax.h | 513 + .../threadblock/epilogue_with_absmax.h | 923 + .../threadblock/epilogue_with_broadcast.h | 1718 + .../threadblock/epilogue_with_reduction.h | 823 + .../threadblock/epilogue_with_visitor.h | 409 + .../epilogue_with_visitor_callbacks.h | 504 + .../epilogue/threadblock/epilogue_workspace.h | 197 + .../threadblock/fusion/visitor_2x.hpp | 433 + .../threadblock/fusion/visitor_compute.hpp | 109 + .../threadblock/fusion/visitor_load.hpp | 583 + .../threadblock/fusion/visitor_store.hpp | 805 + .../epilogue/threadblock/fusion/visitors.hpp | 38 + .../threadblock/interleaved_epilogue.h | 407 + .../threadblock/output_iterator_parameter.h | 223 + .../threadblock/output_tile_thread_map.h | 628 + .../threadblock/predicated_tile_iterator.h | 1387 + .../predicated_tile_iterator_affine.h | 615 + ...cated_tile_iterator_affine_layout_params.h | 156 + .../predicated_tile_iterator_blas3.h | 633 + .../predicated_tile_iterator_conv.h | 562 + .../predicated_tile_iterator_direct_conv.h | 445 + .../predicated_tile_iterator_params.h | 483 + .../predicated_tile_iterator_predicates.h | 309 + .../predicated_tile_iterator_strided_dgrad.h | 479 + .../threadblock/shared_load_iterator.h | 223 + .../threadblock/shared_load_iterator_mixed.h | 594 + .../shared_load_iterator_pitch_linear.h | 194 + .../fragment_iterator_complex_tensor_op.h | 187 + ...ment_iterator_gaussian_complex_tensor_op.h | 194 + .../epilogue/warp/fragment_iterator_simt.h | 164 + .../warp/fragment_iterator_tensor_op.h | 378 + .../warp/fragment_iterator_volta_tensor_op.h | 269 + .../warp/fragment_iterator_wmma_tensor_op.h | 164 + .../cutlass/epilogue/warp/simt_policy.h | 107 + .../cutlass/epilogue/warp/tensor_op_policy.h | 189 + .../epilogue/warp/tile_iterator_simt.h | 785 + .../epilogue/warp/tile_iterator_tensor_op.h | 671 + .../warp/tile_iterator_tensor_op_mixed.h | 1081 + .../warp/tile_iterator_volta_tensor_op.h | 440 + .../warp/tile_iterator_wmma_tensor_op.h | 227 + .../epilogue/warp/volta_tensor_op_policy.h | 195 + .../epilogue/warp/wmma_tensor_op_policy.h | 101 + .../cutlass/include/cutlass/fast_math.h | 1067 + .../cutlass/include/cutlass/float8.h | 1284 + .../include/cutlass/floating_point_nvrtc.h | 98 + .../cutlass/include/cutlass/functional.h | 930 + .../gemm/collective/builders/sm90_common.inl | 419 + .../collective/builders/sm90_gmma_builder.inl | 1048 + .../builders/sm90_sparse_config.inl | 268 + .../builders/sm90_sparse_gmma_builder.inl | 388 + .../gemm/collective/collective_builder.hpp | 42 + .../collective/collective_builder_decl.hpp | 88 + .../gemm/collective/collective_mma.hpp | 49 + .../gemm/collective/collective_mma_decl.hpp | 64 + .../gemm/collective/fp8_accumulation.hpp | 121 + .../gemm/collective/sm70_mma_twostage.hpp | 597 + .../gemm/collective/sm80_mma_multistage.hpp | 707 + ..._mma_array_tma_gmma_ss_warpspecialized.hpp | 759 + ...mma_multistage_gmma_rs_warpspecialized.hpp | 677 + ...mma_multistage_gmma_ss_warpspecialized.hpp | 509 + .../sm90_mma_tma_gmma_rs_warpspecialized.hpp | 752 + ...ma_gmma_rs_warpspecialized_mixed_input.hpp | 1560 + .../gemm/collective/sm90_mma_tma_gmma_ss.hpp | 539 + .../sm90_mma_tma_gmma_ss_warpspecialized.hpp | 582 + ...90_mma_tma_gmma_ss_warpspecialized_fp8.hpp | 584 + ...sparse_mma_tma_gmma_ss_warpspecialized.hpp | 724 + .../cutlass/gemm/device/base_grouped.h | 478 + .../gemm/device/default_gemm_configuration.h | 955 + .../include/cutlass/gemm/device/ell_gemm.h | 849 + .../include/cutlass/gemm/device/gemm.h | 772 + .../include/cutlass/gemm/device/gemm_array.h | 738 + .../cutlass/gemm/device/gemm_batched.h | 704 + .../cutlass/gemm/device/gemm_complex.h | 718 + .../cutlass/gemm/device/gemm_grouped.h | 61 + .../device/gemm_layernorm_mainloop_fusion.h | 385 + .../include/cutlass/gemm/device/gemm_sparse.h | 515 + .../gemm/device/gemm_sparse_universal.h | 211 + .../gemm_sparse_universal_with_absmax.h | 202 + .../gemm/device/gemm_sparse_with_absmax.h | 360 + .../gemm/device/gemm_sparse_with_visitor.h | 342 + .../gemm/device/gemm_splitk_parallel.h | 636 + .../cutlass/gemm/device/gemm_universal.h | 442 + .../gemm/device/gemm_universal_adapter.h | 693 + .../cutlass/gemm/device/gemm_universal_base.h | 522 + .../gemm_universal_streamk_with_broadcast.h | 386 + .../gemm/device/gemm_universal_with_absmax.h | 404 + .../device/gemm_universal_with_broadcast.h | 386 + .../gemm/device/gemm_with_k_reduction.h | 415 + .../include/cutlass/gemm/device/gemv.h | 182 + .../include/cutlass/gemm/device/rank_2k.h | 548 + .../cutlass/gemm/device/rank_2k_grouped.h | 63 + .../include/cutlass/gemm/device/rank_k.h | 510 + .../include/cutlass/gemm/device/symm.h | 603 + .../include/cutlass/gemm/device/trmm.h | 759 + .../include/cutlass/gemm/dispatch_policy.hpp | 324 + .../cutlass/include/cutlass/gemm/gemm.h | 133 + .../cutlass/gemm/gemm_enumerated_types.h | 80 + .../gemm/group_array_problem_shape.hpp | 123 + .../cutlass/gemm/kernel/default_ell_gemm.h | 837 + .../cutlass/gemm/kernel/default_gemm.h | 1189 + .../gemm/kernel/default_gemm_complex.h | 404 + .../gemm/kernel/default_gemm_grouped.h | 384 + ...ult_gemm_grouped_softmax_mainloop_fusion.h | 164 + .../default_gemm_layernorm_mainloop_fusion.h | 137 + .../default_gemm_planar_complex_universal.h | 352 + .../cutlass/gemm/kernel/default_gemm_sparse.h | 252 + .../kernel/default_gemm_sparse_universal.h | 141 + ...efault_gemm_sparse_universal_with_absmax.h | 144 + .../kernel/default_gemm_sparse_with_absmax.h | 157 + .../kernel/default_gemm_sparse_with_visitor.h | 197 + .../kernel/default_gemm_splitk_parallel.h | 136 + .../default_gemm_streamk_with_broadcast.h | 146 + .../gemm/kernel/default_gemm_universal.h | 396 + .../default_gemm_universal_with_visitor.h | 157 + .../gemm/kernel/default_gemm_with_absmax.h | 143 + .../gemm/kernel/default_gemm_with_broadcast.h | 243 + .../kernel/default_gemm_with_k_reduction.h | 150 + .../gemm/kernel/default_gemm_with_reduction.h | 246 + .../cutlass/gemm/kernel/default_gemv.h | 132 + .../cutlass/gemm/kernel/default_rank_2k.h | 285 + .../gemm/kernel/default_rank_2k_complex.h | 498 + .../gemm/kernel/default_rank_2k_grouped.h | 355 + .../gemm/kernel/default_rank_2k_universal.h | 346 + .../cutlass/gemm/kernel/default_rank_k.h | 247 + .../gemm/kernel/default_rank_k_complex.h | 429 + .../gemm/kernel/default_rank_k_universal.h | 305 + .../cutlass/gemm/kernel/default_symm.h | 321 + .../gemm/kernel/default_symm_complex.h | 508 + .../gemm/kernel/default_symm_universal.h | 342 + .../cutlass/gemm/kernel/default_trmm.h | 269 + .../gemm/kernel/default_trmm_complex.h | 265 + .../gemm/kernel/default_trmm_universal.h | 359 + .../include/cutlass/gemm/kernel/ell_gemm.h | 824 + .../include/cutlass/gemm/kernel/gemm.h | 380 + .../include/cutlass/gemm/kernel/gemm_array.h | 264 + .../cutlass/gemm/kernel/gemm_batched.h | 273 + .../cutlass/gemm/kernel/gemm_grouped.h | 457 + .../kernel/gemm_grouped_problem_visitor.h | 121 + .../gemm_grouped_softmax_mainloop_fusion.h | 481 + .../kernel/gemm_layernorm_mainloop_fusion.h | 782 + .../include/cutlass/gemm/kernel/gemm_params.h | 189 + .../cutlass/gemm/kernel/gemm_pipelined.h | 158 + .../cutlass/gemm/kernel/gemm_planar_complex.h | 715 + .../gemm/kernel/gemm_planar_complex_array.h | 609 + .../gemm/kernel/gemm_sparse_universal.h | 804 + .../gemm_sparse_universal_with_absmax.h | 609 + .../gemm/kernel/gemm_splitk_parallel.h | 253 + .../kernel/gemm_streamk_with_fused_epilogue.h | 2396 + .../gemm/kernel/gemm_transpose_operands.h | 124 + .../cutlass/gemm/kernel/gemm_universal.h | 702 + .../cutlass/gemm/kernel/gemm_universal.hpp | 66 + .../cutlass/gemm/kernel/gemm_universal_decl.h | 61 + .../gemm/kernel/gemm_universal_streamk.h | 1168 + .../gemm/kernel/gemm_universal_with_visitor.h | 321 + .../gemm_universal_with_visitor_streamk.h | 895 + .../cutlass/gemm/kernel/gemm_with_absmax.h | 759 + .../gemm/kernel/gemm_with_fused_epilogue.h | 1512 + .../gemm/kernel/gemm_with_k_reduction.h | 704 + .../include/cutlass/gemm/kernel/gemv.h | 638 + .../gemm/kernel/gemv_batched_strided.h | 244 + .../gemm/kernel/grouped_problem_visitor.h | 463 + .../cutlass/gemm/kernel/params_sparse_base.h | 115 + .../gemm/kernel/params_universal_base.h | 264 + .../cutlass/gemm/kernel/rank_2k_grouped.h | 688 + .../kernel/rank_2k_grouped_problem_visitor.h | 376 + .../gemm/kernel/rank_2k_transpose_operands.h | 129 + .../cutlass/gemm/kernel/rank_2k_universal.h | 769 + .../cutlass/gemm/kernel/rank_k_universal.h | 556 + .../include/cutlass/gemm/kernel/sm70_gemm.hpp | 270 + ..._array_tma_warpspecialized_cooperative.hpp | 881 + ...emm_array_tma_warpspecialized_pingpong.hpp | 946 + .../cutlass/gemm/kernel/sm90_gemm_tma.hpp | 306 + .../kernel/sm90_gemm_tma_warpspecialized.hpp | 522 + ...0_gemm_tma_warpspecialized_cooperative.hpp | 671 + ...sm90_gemm_tma_warpspecialized_pingpong.hpp | 664 + .../gemm/kernel/sm90_gemm_warpspecialized.hpp | 417 + .../sm90_gemm_warpspecialized_cooperative.hpp | 504 + .../sm90_gemm_warpspecialized_pingpong.hpp | 516 + .../gemm/kernel/sm90_tile_scheduler.hpp | 139 + .../gemm/kernel/sm90_tile_scheduler_group.hpp | 510 + .../kernel/sm90_tile_scheduler_stream_k.hpp | 960 + .../include/cutlass/gemm/kernel/sparse_gemm.h | 394 + .../gemm/kernel/sparse_gemm_with_absmax.h | 509 + .../gemm/kernel/sparse_gemm_with_visitor.h | 238 + .../gemm/kernel/static_tile_scheduler.hpp | 502 + .../cutlass/gemm/kernel/symm_universal.h | 675 + .../cutlass/gemm/kernel/tile_scheduler.hpp | 149 + .../gemm/kernel/tile_scheduler_params.h | 1535 + .../cutlass/gemm/kernel/trmm_universal.h | 580 + .../cutlass/include/cutlass/gemm/thread/mma.h | 90 + .../include/cutlass/gemm/thread/mma_sm50.h | 538 + .../include/cutlass/gemm/thread/mma_sm60.h | 1161 + .../include/cutlass/gemm/thread/mma_sm61.h | 284 + .../gemm/threadblock/default_ell_mma.h | 734 + .../gemm/threadblock/default_gemv_core.h | 151 + .../cutlass/gemm/threadblock/default_mma.h | 823 + .../gemm/threadblock/default_mma_core.h | 116 + .../gemm/threadblock/default_mma_core_simt.h | 1723 + .../gemm/threadblock/default_mma_core_sm70.h | 682 + .../gemm/threadblock/default_mma_core_sm75.h | 1315 + .../gemm/threadblock/default_mma_core_sm80.h | 2951 + .../default_mma_core_sparse_sm80.h | 876 + .../default_mma_core_with_access_size.h | 328 + .../default_mma_core_with_reduction.h | 167 + .../gemm/threadblock/default_mma_core_wmma.h | 712 + .../default_mma_layernorm_mainloop_fusion.h | 178 + .../default_mma_planar_complex_multistage.h | 136 + .../default_mma_planar_complex_pipelined.h | 130 + .../default_mma_softmax_mainloop_fusion.h | 160 + .../threadblock/default_mma_with_reduction.h | 141 + .../default_multistage_mma_complex.h | 159 + .../default_multistage_mma_complex_core.h | 119 + ...default_multistage_mma_complex_core_sm80.h | 1808 + .../default_multistage_trmm_complex.h | 556 + .../gemm/threadblock/default_sparse_mma.h | 196 + .../cutlass/gemm/threadblock/default_trmm.h | 445 + .../gemm/threadblock/ell_mma_multistage.h | 648 + .../gemm/threadblock/ell_mma_pipelined.h | 376 + .../include/cutlass/gemm/threadblock/gemv.h | 147 + .../cutlass/gemm/threadblock/index_remat.h | 107 + .../cutlass/gemm/threadblock/mma_base.h | 236 + .../gemm/threadblock/mma_blas3_multistage.h | 707 + ...mma_layernorm_mainloop_fusion_multistage.h | 863 + .../cutlass/gemm/threadblock/mma_multistage.h | 741 + .../cutlass/gemm/threadblock/mma_pipelined.h | 439 + .../threadblock/mma_planar_complex_base.h | 208 + .../mma_planar_complex_multistage.h | 646 + .../mma_planar_complex_pipelined.h | 424 + .../gemm/threadblock/mma_singlestage.h | 265 + .../mma_softmax_mainloop_fusion_multistage.h | 756 + .../gemm/threadblock/mma_sparse_base.h | 273 + .../gemm/threadblock/mma_sparse_multistage.h | 668 + .../mma_with_reduction_multistage.h | 545 + .../gemm/threadblock/threadblock_swizzle.h | 459 + .../threadblock/threadblock_swizzle_streamk.h | 801 + .../gemm/warp/default_mma_complex_tensor_op.h | 612 + .../gemm/warp/default_mma_sparse_tensor_op.h | 165 + .../cutlass/gemm/warp/default_mma_tensor_op.h | 123 + .../gemm/warp/default_mma_tensor_op_sm80.h | 375 + .../default_mma_with_reduction_tensor_op.h | 92 + .../gemm/warp/default_mma_wmma_tensor_op.h | 130 + .../warp/layernorm_scale_bias_transform.h | 139 + .../cutlass/include/cutlass/gemm/warp/mma.h | 60 + .../cutlass/gemm/warp/mma_complex_tensor_op.h | 1168 + .../warp/mma_complex_tensor_op_fast_f32.h | 663 + ...mma_complex_tensor_op_tile_iterator_sm80.h | 2485 + .../warp/mma_gaussian_complex_tensor_op.h | 642 + ...ian_complex_tensor_op_tile_iterator_sm80.h | 390 + .../gemm/warp/mma_mixed_input_tensor_op.h | 566 + .../cutlass/gemm/warp/mma_planar_complex.h | 182 + .../include/cutlass/gemm/warp/mma_simt.h | 263 + .../cutlass/gemm/warp/mma_simt_policy.h | 69 + .../gemm/warp/mma_simt_tile_iterator.h | 1890 + .../cutlass/gemm/warp/mma_sparse_tensor_op.h | 382 + .../include/cutlass/gemm/warp/mma_tensor_op.h | 415 + .../gemm/warp/mma_tensor_op_fast_f32.h | 471 + .../warp/mma_tensor_op_fragment_iterator.h | 559 + .../cutlass/gemm/warp/mma_tensor_op_policy.h | 65 + .../cutlass/gemm/warp/mma_tensor_op_sm70.h | 280 + .../warp/mma_tensor_op_tile_access_iterator.h | 362 + .../gemm/warp/mma_tensor_op_tile_iterator.h | 4803 ++ .../warp/mma_tensor_op_tile_iterator_sm70.h | 3098 + .../warp/mma_tensor_op_tile_iterator_sm80.h | 2441 + .../warp/mma_tensor_op_tile_iterator_sparse.h | 380 + .../warp/mma_tensor_op_tile_iterator_wmma.h | 805 + .../cutlass/gemm/warp/mma_tensor_op_wmma.h | 223 + .../gemm/warp/mma_with_reduction_tensor_op.h | 449 + .../gemm/warp/scale_bias_tile_iterator.h | 572 + .../gemm/warp/softmax_scale_bias_transform.h | 117 + .../gemm/warp/tile_iterator_planar_complex.h | 250 + .../cutlass/include/cutlass/gemm_coord.h | 394 + .../cutlass/include/cutlass/gemm_coord.hpp | 66 + .../cutlass/include/cutlass/half.h | 930 + .../cutlass/include/cutlass/integer_subbyte.h | 280 + .../include/cutlass/kernel_hardware_info.h | 76 + .../include/cutlass/kernel_hardware_info.hpp | 35 + .../cutlass/include/cutlass/kernel_launch.h | 141 + .../cutlass/include/cutlass/layout/layout.h | 64 + .../cutlass/include/cutlass/layout/matrix.h | 1349 + .../cutlass/include/cutlass/layout/permute.h | 828 + .../include/cutlass/layout/pitch_linear.h | 149 + .../cutlass/include/cutlass/layout/tensor.h | 648 + .../layout/tensor_op_multiplicand_sm70.h | 1044 + .../layout/tensor_op_multiplicand_sm75.h | 1169 + .../layout/tensor_op_multiplicand_sm80.h | 1139 + .../cutlass/include/cutlass/layout/vector.h | 105 + .../cutlass/include/cutlass/matrix.h | 14129 ++++ .../cutlass/include/cutlass/matrix_coord.h | 164 + .../cutlass/include/cutlass/matrix_shape.h | 65 + .../include/cutlass/numeric_conversion.h | 4547 ++ .../cutlass/include/cutlass/numeric_size.h | 83 + .../cutlass/include/cutlass/numeric_types.h | 88 + .../include/cutlass/pipeline/pipeline.hpp | 36 + .../cutlass/pipeline/sm90_pipeline.hpp | 1173 + .../include/cutlass/pitch_linear_coord.h | 181 + .../include/cutlass/platform/platform.h | 913 + .../include/cutlass/predicate_vector.h | 547 + .../cutlass/include/cutlass/quaternion.h | 752 + .../cutlass/include/cutlass/real.h | 61 + .../cutlass/reduction/device/reduce_split_k.h | 232 + .../cutlass/reduction/device/tensor_reduce.h | 264 + .../device/tensor_reduce_affine_contiguous.h | 374 + .../device/tensor_reduce_affine_strided.h | 362 + .../reduction/kernel/reduce_softmax_final.h | 267 + .../cutlass/reduction/kernel/reduce_split_k.h | 248 + .../kernel/tensor_reduce_affine_contiguous.h | 606 + .../kernel/tensor_reduce_affine_strided.h | 641 + .../include/cutlass/reduction/thread/reduce.h | 234 + .../reduction/thread/reduction_operators.h | 235 + .../cutlass/reduction/threadblock_swizzle.h | 67 + .../include/cutlass/relatively_equal.h | 275 + .../cutlass/include/cutlass/semaphore.h | 118 + .../include/cutlass/subbyte_reference.h | 1388 + .../cutlass/include/cutlass/tensor_coord.h | 326 + .../cutlass/include/cutlass/tensor_ref.h | 419 + .../cutlass/tensor_ref_planar_complex.h | 374 + .../cutlass/include/cutlass/tensor_view.h | 297 + .../cutlass/tensor_view_planar_complex.h | 301 + .../cutlass/include/cutlass/tfloat32.h | 478 + .../cutlass/include/cutlass/thread/matrix.h | 198 + .../cutlass/include/cutlass/trace.h | 59 + .../collective/sm90_wgmma_transpose.hpp | 754 + .../device/transform_universal_adapter.hpp | 303 + .../kernel/filter_format_transformer.hpp | 223 + .../kernel/sm90_sparse_gemm_compressor.hpp | 578 + .../kernel/sparse_gemm_compressor.hpp | 284 + .../transform/pitch_linear_thread_map.h | 926 + .../cutlass/transform/thread/transpose.h | 107 + .../cutlass/transform/thread/unary_op.h | 105 + .../transform/threadblock/ell_iterator.h | 199 + .../ell_predicated_tile_access_iterator.h | 1350 + .../ell_predicated_tile_iterator.h | 1315 + ...icated_scale_bias_vector_access_iterator.h | 375 + .../predicated_scale_bias_vector_iterator.h | 328 + .../predicated_tile_access_iterator.h | 2118 + ...icated_tile_access_iterator_2dthreadtile.h | 834 + .../predicated_tile_access_iterator_params.h | 290 + ...d_tile_access_iterator_triangular_matrix.h | 892 + .../threadblock/predicated_tile_iterator.h | 1887 + .../predicated_tile_iterator_2dthreadtile.h | 787 + ...edicated_tile_iterator_triangular_matrix.h | 818 + .../predicated_vector_access_iterator.h | 417 + ...egular_scale_bias_vector_access_iterator.h | 253 + .../regular_tile_access_iterator.h | 58 + ...egular_tile_access_iterator_pitch_linear.h | 408 + ...access_iterator_pitch_linear_direct_conv.h | 587 + .../regular_tile_access_iterator_tensor_op.h | 821 + ...ular_tile_access_iterator_tensor_op_sm80.h | 1532 + .../threadblock/regular_tile_iterator.h | 62 + .../regular_tile_iterator_pitch_linear.h | 552 + ..._tile_iterator_pitch_linear_2dthreadtile.h | 509 + .../regular_tile_iterator_tensor_op.h | 1107 + .../regular_tile_iterator_tensor_op_sm70.h | 1460 + .../transform/threadblock/vector_iterator.h | 149 + .../transform/warp/vector_fragment_iterator.h | 283 + .../cutlass/include/cutlass/uint128.h | 270 + .../cutlass/include/cutlass/version.h | 80 + .../cutlass/include/cutlass/wmma_array.h | 133 + .../cutlass/include/cutlass/workspace.h | 150 + .../include/cutlass_extensions/common.hpp | 48 + .../epilogue/broadcast_load_epilogue_c3x.hpp | 447 + .../epilogue/scaled_mm_epilogues_c3x.hpp | 286 + lightllm-kernel/include/ops_common.h | 66 +- lightllm-kernel/include/reduce/sm70.cuh | 191 + lightllm-kernel/include/utils.h | 267 + .../lightllm_kernel/ops/__init__.py | 51 +- lightllm-kernel/lightllm_kernel/ops/fusion.py | 22 + lightllm-kernel/lightllm_kernel/ops/gemm.py | 8 + lightllm-kernel/lightllm_kernel/ops/norm.py | 7 + lightllm-kernel/lightllm_kernel/ops/quant.py | 10 + lightllm-kernel/setup.py | 45 +- lightllm-kernel/test/__init__.py | 0 .../test/fusion/add_norm_quant_test.py | 70 + .../test/fusion/gelu_per_token_quant_test.py | 50 + .../test/fusion/post_tp_norm_test.py | 54 + .../test/fusion/pre_tp_norm_test.py | 46 + .../test/gemm/cutlass_scaled_mm_test.py | 80 + lightllm-kernel/test/norm/rmsnorm_test.py | 45 + lightllm-kernel/test/quant/quant_test.py | 47 + lightllm-kernel/test/utils.py | 125 + 702 files changed, 554067 insertions(+), 112 deletions(-) create mode 100755 lightllm-kernel/csrc/fusion/add_norm_quant.cu create mode 100755 lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu create mode 100755 lightllm-kernel/csrc/fusion/post_tp_norm.cu create mode 100755 lightllm-kernel/csrc/fusion/pre_tp_norm.cu create mode 100755 lightllm-kernel/csrc/gemm/Epilogues.md create mode 100755 lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu create mode 100755 lightllm-kernel/csrc/gemm/scaled_mm_c3x.cuh create mode 100755 lightllm-kernel/csrc/gemm/scaled_mm_c3x_sm90_fp8_dispatch.cuh create mode 100755 lightllm-kernel/csrc/gemm/scaled_mm_entry.cu delete mode 100644 lightllm-kernel/csrc/moe/grouped_topk_interface.cpp create mode 100755 lightllm-kernel/csrc/norm/rmsnorm_bf16.cu create mode 100755 lightllm-kernel/csrc/quant/per_token_quantize_bf16.cu create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/axpby.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/clear.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/cooperative_copy.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/cooperative_gemm.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/copy.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/fill.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/functional.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/gemm.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/prefer.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/prefetch.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/tensor_algorithms.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/algorithm/tuple_algorithms.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/cluster_sm90.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/config.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm50.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm75.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm80.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm90.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm90_desc.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/copy_sm90_tma.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm61.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm70.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm75.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm80.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_desc.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_ext.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/arch/util.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_atom.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm50.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm75.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm80.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_im2col.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_atom.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm61.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm70.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm75.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm80.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_ext.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/config.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/container/alignment.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/container/array.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/container/array_aligned.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/container/array_subbyte.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/container/bit_field.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/container/cuda_types.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/container/packed_tuple.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/container/tuple.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/container/type_list.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/int_tuple.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/layout.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/layout_composed.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/arithmetic_tuple.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/complex.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/int.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/integer_sequence.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/integral_constant.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/integral_ratio.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/math.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/numeric_types.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/numeric/real.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/pointer.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/pointer_base.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/pointer_flagged.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/pointer_sparse.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/pointer_swizzle.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/stride.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/swizzle.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/swizzle_layout.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/tensor.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/tensor_impl.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/tensor_predicate.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/tensor_zip.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/underscore.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/util/debug.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/util/print.hpp create mode 100755 lightllm-kernel/cutlass/include/cute/util/type_traits.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/aligned_buffer.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/arch.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/barrier.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/cache_operation.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/config.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/grid_dependency_control.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/memory.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/memory_sm75.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/memory_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm50.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm60.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm61.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm70.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm75.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm89.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sm90.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/mma_sparse_sm89.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/reg_reconfig.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/simd.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/simd_sm60.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/simd_sm61.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/synclog.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm70.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm72.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/arch/wmma_sm75.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/array.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/array_planar_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/array_subbyte.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/barrier.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/bfloat16.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/blas3.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/blas3_types.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/block_striped.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/cluster_launch.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/constants.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_common.inl create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_builder.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/collective_conv.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/detail.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/conv2d_problem_size.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/conv3d_problem_size.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/convnd_problem_shape.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/convolution.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/detail.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/direct_convolution.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/dispatch_policy.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/conv_universal.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/direct_convolution.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/thread/depthwise_mma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_params.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_params.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/coord.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/core_io.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/cuda_host_adapter.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/cutlass.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/collective.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/dependent_false.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/helper_macros.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/layout.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/detail/mma.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/device_kernel.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_builder.inl create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/builders/sm90_common.inl create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/detail.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/dispatch_policy.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/operations.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/activation.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/conversion_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/detail.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_bias_relu.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_clamp.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_dgelu.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_drelu.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_gelu.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_hardswish.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_params.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_planar_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_relu0.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_residual_block.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_sigmoid.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_silu.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/reduction_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/thread/scale_type.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_simt.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_simt.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_depthwise.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_direct_store.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/epilogue_workspace.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/fusion/visitors.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/interleaved_epilogue.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_iterator_parameter.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/output_tile_thread_map.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_simt.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/simt_policy.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tensor_op_policy.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_simt.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/volta_tensor_op_policy.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/fast_math.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/float8.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/floating_point_nvrtc.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/functional.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_common.inl create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_builder_decl.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/collective_mma_decl.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/fp8_accumulation.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm70_mma_twostage.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm80_mma_multistage.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/base_grouped.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/default_gemm_configuration.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/ell_gemm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_array.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_batched.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_grouped.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_sparse_with_visitor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_splitk_parallel.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_adapter.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_base.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_universal_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemm_with_k_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/gemv.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_2k_grouped.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/rank_k.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/symm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/device/trmm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/dispatch_policy.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/gemm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/gemm_enumerated_types.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/group_array_problem_shape.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_ell_gemm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemm_with_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_gemv.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_grouped.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_2k_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_rank_k_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_symm_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/default_trmm_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/ell_gemm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_array.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_batched.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_params.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_pipelined.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_planar_complex_array.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_splitk_parallel.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_transpose_operands.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_decl.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_streamk.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemm_with_k_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/gemv_batched_strided.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/grouped_problem_visitor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_sparse_base.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/params_universal_base.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_2k_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/rank_k_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm70_gemm.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/static_tile_scheduler.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/symm_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/tile_scheduler_params.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/kernel/trmm_universal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm50.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm60.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/thread/mma_sm61.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_ell_mma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_gemv_core.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_simt.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm70.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm75.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_core_wmma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_mma_with_reduction.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_sparse_mma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/default_trmm.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/ell_mma_pipelined.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/gemv.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/index_remat.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_base.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_blas3_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_pipelined.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_base.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_singlestage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_base.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_sparse_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_planar_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_policy.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_simt_tile_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_sparse_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_policy.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_sm70.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_tensor_op_wmma.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/scale_bias_tile_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/softmax_scale_bias_transform.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm/warp/tile_iterator_planar_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm_coord.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/gemm_coord.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/half.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/integer_subbyte.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/kernel_hardware_info.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/kernel_launch.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/layout.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/matrix.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/permute.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/pitch_linear.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm70.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm75.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/tensor_op_multiplicand_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/layout/vector.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/matrix.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/matrix_coord.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/matrix_shape.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/numeric_conversion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/numeric_size.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/numeric_types.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/pipeline/pipeline.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/pipeline/sm90_pipeline.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/pitch_linear_coord.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/platform/platform.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/predicate_vector.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/quaternion.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/real.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/reduce_split_k.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduce.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/thread/reduction_operators.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/reduction/threadblock_swizzle.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/relatively_equal.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/semaphore.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/subbyte_reference.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_coord.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_ref.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_ref_planar_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_view.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/tensor_view_planar_complex.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/tfloat32.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/thread/matrix.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/trace.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/pitch_linear_thread_map.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/thread/transpose.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/thread/unary_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/threadblock/vector_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/transform/warp/vector_fragment_iterator.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/uint128.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/version.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/wmma_array.h create mode 100755 lightllm-kernel/cutlass/include/cutlass/workspace.h create mode 100755 lightllm-kernel/include/cutlass_extensions/common.hpp create mode 100755 lightllm-kernel/include/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp create mode 100755 lightllm-kernel/include/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp create mode 100755 lightllm-kernel/include/reduce/sm70.cuh create mode 100644 lightllm-kernel/include/utils.h create mode 100644 lightllm-kernel/lightllm_kernel/ops/fusion.py create mode 100644 lightllm-kernel/lightllm_kernel/ops/gemm.py create mode 100644 lightllm-kernel/lightllm_kernel/ops/norm.py create mode 100644 lightllm-kernel/test/__init__.py create mode 100755 lightllm-kernel/test/fusion/add_norm_quant_test.py create mode 100644 lightllm-kernel/test/fusion/gelu_per_token_quant_test.py create mode 100755 lightllm-kernel/test/fusion/post_tp_norm_test.py create mode 100755 lightllm-kernel/test/fusion/pre_tp_norm_test.py create mode 100644 lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py create mode 100755 lightllm-kernel/test/norm/rmsnorm_test.py create mode 100755 lightllm-kernel/test/quant/quant_test.py create mode 100644 lightllm-kernel/test/utils.py diff --git a/lightllm-kernel/csrc/fusion/add_norm_quant.cu b/lightllm-kernel/csrc/fusion/add_norm_quant.cu new file mode 100755 index 000000000..3684dffc8 --- /dev/null +++ b/lightllm-kernel/csrc/fusion/add_norm_quant.cu @@ -0,0 +1,551 @@ +#include "ops_common.h" +#include "reduce/sm70.cuh" + +namespace lightllm { +namespace ops { + +using namespace lightllm; + +template +__global__ void device_add_norm_quant_bf16_general( + bf16_t* __restrict__ input, // Input tensor in BF16 format + const bf16_t* __restrict__ residual, // Residual tensor in BF16 format + const bf16_t* __restrict__ weight, // Weight tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const int32_t N, // Number of cols in the input tensor + const fp32_t eps // Epsilon value for numerical stability +) { + const fp32_t r_N = 1 / (fp32_t)N; // Reciprocal of N. + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _input = input + bid * N; + const bf16_t* _residual = residual + bid * N; + fp8_e4m3_t* _output = output + bid * N; + + fp32_t* _scales; + _scales = scales + bid; + + // Shared memory workspace to store data. + extern __shared__ bf16_t workspace1[]; + + // Local registers to hold data. + bf16_t local_input; + bf16_t local_residual; + bf16_t local_w; + bf16_t local_output; + fp8_e4m3_t local_f8; + + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + for (int32_t i = tid; i < N; i += TPB) { + local_input = _input[i]; + local_residual = _residual[i]; + + fp32_t x = cvt_bf16_f32(local_input); + fp32_t r = cvt_bf16_f32(local_residual); + local_input = cvt_f32_bf16(x + r); + fp32_t tmp = cvt_bf16_f32(local_input); + local_square_sum += tmp * tmp; + + _input[i] = local_input; + workspace1[i] = local_input; + } + + const fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + const fp32_t mean_square = reduced_square_sum * r_N; + const fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. + fp32_t local_max = -FLT_MAX; + for (int32_t i = tid; i < N; i += TPB) { + local_input = workspace1[i]; + local_w = weight[i]; + + fp32_t x = cvt_bf16_f32(local_input); + fp32_t w = cvt_bf16_f32(local_w); + + fp32_t ret = x * inv_norm * w; + local_output = cvt_f32_bf16(ret); + fp32_t tmp = cvt_bf16_f32(local_output); + local_max = fmaxf(local_max, fabsf(tmp)); + + workspace1[i] = local_output; + } + + // Reduce the maximum value across the block + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid; i < N; i += TPB) { + local_output = workspace1[i]; + + fp32_t tmp = cvt_bf16_f32(local_output); + fp32_t ret = tmp * inv_scale; + local_f8 = fp8_e4m3_t(ret); + + _output[i] = local_f8; + } + + if(tid == 0){ + *_scales = scale; + } +} + + + +template +__global__ void device_add_norm_quant_bf16_vpt( + bf16_t* __restrict__ input, // Input tensor in BF16 format + const bf16_t* __restrict__ residual, // Residual tensor in BF16 format + const bf16_t* __restrict__ weight, // Weight tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const int32_t N, // Number of cols in the input tensor + const fp32_t eps // Epsilon value for numerical stability +) { + constexpr int32_t VPT = 8; // Number of FP16 values processed per thread. + const fp32_t r_N = 1 / (fp32_t)N; // Reciprocal of N. + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _input = input + bid * N; + const bf16_t* _residual = residual + bid * N; + fp8_e4m3_t* _output = output + bid * N; + + fp32_t* _scales; + _scales = scales + bid; + + // Shared memory workspace to store vectorized (half2) data. + // Note: since each bf16x2_t holds 2 half values, the workspace size is N/2. + extern __shared__ bf16x2_t workspace2[]; + + // Local registers to hold vectorized data. + bf16x2_t local_input[VPT / 2]; + bf16x2_t local_residual[VPT / 2]; + bf16x2_t local_w[VPT / 2]; + bf16x2_t local_output[VPT / 2]; + fp8x4_e4m3_t local_f8[VPT / 4]; + + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load VPT FP16 elements from global memory (_input) into local vector (local_input). + vec_copy(_input + i, local_input); + // Load VPT FP16 elements from global memory (_residual) into local vector (local_residual). + vec_copy(_residual + i, local_residual); + + # pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + // Convert the bf16x2_t to fp32x2_t for computation. + fp32x2_t x = bf16x2_to_fp32x2(local_input[j]); + fp32x2_t r = bf16x2_to_fp32x2(local_residual[j]); + // Add the residual to the input. + local_input[j] = _float22bf162_rn(make_float2(x.x + r.x, x.y + r.y)); + + fp32x2_t tmp = bf16x2_to_fp32x2(local_input[j]); + local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y); + } + + // Store the loaded data into shared memory. + // Divide index by 2 because 'workspace' is an array of bf16x2_t. + vec_copy(local_input, _input + i); + vec_copy(local_input, workspace2 + (i >> 1)); + } + + const fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + const fp32_t mean_square = reduced_square_sum * r_N; + const fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. + fp32_t local_max = -FLT_MAX; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load the previously stored vectorized data from shared memory. + vec_copy(workspace2 + (i >> 1), local_input); + // Load the corresponding weight values from global memory. + vec_copy(weight + i, local_w); + + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_input[j]); + fp32x2_t w = bf16x2_to_fp32x2(local_w[j]); + // Apply normalization: multiply by inv_norm and then scale by the weight. + fp32x2_t ret = make_float2( + x.x * inv_norm * w.x, + x.y * inv_norm * w.y + ); + local_output[j] = _float22bf162_rn(ret); + + + fp32x2_t tmp = bf16x2_to_fp32x2(local_output[j]); + fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y)); + local_max = fmaxf(local_max, max); + } + + vec_copy(local_output, workspace2 + (i >> 1)); + } + + // Reduce the maximum value across the block + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(workspace2 + (i >> 1), local_output); + + #pragma unroll + for (int32_t j = 0; j < VPT/4; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_output[2 * j + 0]); + fp32x2_t y = bf16x2_to_fp32x2(local_output[2 * j + 1]); + fp32x4_t ret = make_float4( + x.x * inv_scale, + x.y * inv_scale, + y.x * inv_scale, + y.y * inv_scale + ); + local_f8[j] = fp8x4_e4m3_t(ret); + } + + vec_copy(local_f8, _output + i); + } + + if(tid == 0){ + *_scales = scale; + } +} + + +template +__global__ void device_add_norm_quant_bf16( + bf16_t* __restrict__ input, // Input tensor in BF16 format + const bf16_t* __restrict__ residual, // Residual tensor in BF16 format + const bf16_t* __restrict__ weight, // Weight tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const fp32_t eps // Epsilon value for numerical stability +) { + constexpr int32_t VPT = 8; // Number of FP16 values processed per thread. + constexpr fp32_t r_N = 1 / (fp32_t)N; // Reciprocal of N. + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + + static_assert(N % 2 == 0, "N must be even."); + static_assert(N % VPT == 0, "N must be a multiple of VPT."); + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _input = input + bid * N; + const bf16_t* _residual = residual + bid * N; + fp8_e4m3_t* _output = output + bid * N; + + fp32_t* _scales; + _scales = scales + bid; + + // Shared memory workspace to store vectorized (half2) data. + // Note: since each bf16x2_t holds 2 half values, the workspace size is N/2. + __shared__ bf16x2_t workspace[N / 2]; + + // Local registers to hold vectorized data. + bf16x2_t local_input[VPT / 2]; + bf16x2_t local_residual[VPT / 2]; + bf16x2_t local_w[VPT / 2]; + bf16x2_t local_output[VPT / 2]; + fp8x4_e4m3_t local_f8[VPT / 4]; + + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + # pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load VPT FP16 elements from global memory (_input) into local vector (local_input). + vec_copy(_input + i, local_input); + // Load VPT FP16 elements from global memory (_residual) into local vector (local_residual). + vec_copy(_residual + i, local_residual); + + # pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + // Convert the bf16x2_t to fp32x2_t for computation. + fp32x2_t x = bf16x2_to_fp32x2(local_input[j]); + fp32x2_t r = bf16x2_to_fp32x2(local_residual[j]); + // Add the residual to the input. + local_input[j] = _float22bf162_rn(make_float2(x.x + r.x, x.y + r.y)); + + fp32x2_t tmp = bf16x2_to_fp32x2(local_input[j]); + local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y); + } + + // Store the loaded data into shared memory. + // Divide index by 2 because 'workspace' is an array of bf16x2_t. + vec_copy(local_input, _input + i); + vec_copy(local_input, workspace + (i >> 1)); + } + + const fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + const fp32_t mean_square = reduced_square_sum * r_N; + const fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. + fp32_t local_max = -FLT_MAX; + #pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load the previously stored vectorized data from shared memory. + vec_copy(workspace + (i >> 1), local_input); + // Load the corresponding weight values from global memory. + vec_copy(weight + i, local_w); + + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_input[j]); + fp32x2_t w = bf16x2_to_fp32x2(local_w[j]); + // Apply normalization: multiply by inv_norm and then scale by the weight. + fp32x2_t ret = make_float2( + x.x * inv_norm * w.x, + x.y * inv_norm * w.y + ); + local_output[j] = _float22bf162_rn(ret); + + + fp32x2_t tmp = bf16x2_to_fp32x2(local_output[j]); + fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y)); + local_max = fmaxf(local_max, max); + } + + vec_copy(local_output, workspace + (i >> 1)); + } + + // Reduce the maximum value across the block + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + #pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(workspace + (i >> 1), local_output); + + #pragma unroll + for (int32_t j = 0; j < VPT/4; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_output[2 * j + 0]); + fp32x2_t y = bf16x2_to_fp32x2(local_output[2 * j + 1]); + fp32x4_t ret = make_float4( + x.x * inv_scale, + x.y * inv_scale, + y.x * inv_scale, + y.y * inv_scale + ); + local_f8[j] = fp8x4_e4m3_t(ret); + } + + vec_copy(local_f8, _output + i); + } + + if(tid == 0){ + *_scales = scale; + } +} + +/** + * @brief Fused add norm quant + */ +std::tuple add_norm_quant_bf16_fp8( + Tensor& X, const Tensor &R, const Tensor &W, + const fp32_t eps +) { + TORCH_CHECK(X.ndimension() == 2, "Input tensor X must be 2D"); + TORCH_CHECK(R.ndimension() == 2, "Input tensor R must be 2D"); + TORCH_CHECK(W.ndimension() == 1, "Input tensor W must be 1D"); + + TORCH_CHECK(X.is_cuda(), "Input tensor X must be a CUDA tensor."); + TORCH_CHECK(R.is_cuda(), "Input tensor R must be a CUDA tensor."); + TORCH_CHECK(W.is_cuda(), "Input tensor W must be a CUDA tensor."); + + TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor X must be BF16."); + TORCH_CHECK(R.scalar_type() == c10::ScalarType::BFloat16, "Input tensor R must be BF16."); + TORCH_CHECK(W.scalar_type() == c10::ScalarType::BFloat16, "Input tensor W must be BF16."); + + Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous(); + Tensor contiguous_R = R.is_contiguous() ? R : R.contiguous(); + Tensor contiguous_W = W.is_contiguous() ? W : W.contiguous(); + + const uint32_t M = contiguous_X.size(0); + const uint32_t N = contiguous_X.size(1); + + Tensor output_q = torch::empty( + {M, N}, + torch::TensorOptions() + .dtype(torch::kFloat8_e4m3fn) + .device(contiguous_X.device()) + ); + Tensor scales = torch::empty( + {M, 1}, + torch::TensorOptions() + .dtype(torch::kFloat32) + .device(contiguous_X.device()) + ); + + const int32_t blocks = M; + + switch (N) { + case 16: + device_add_norm_quant_bf16<128, 16> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 32: + device_add_norm_quant_bf16<128, 32> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 64: + device_add_norm_quant_bf16<128, 64> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 512: + device_add_norm_quant_bf16<128, 512> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 1024: + device_add_norm_quant_bf16<128, 1024> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 3200: + device_add_norm_quant_bf16<128, 3200> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 4096: + device_add_norm_quant_bf16<128, 4096> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 12800: + device_add_norm_quant_bf16<256, 12800> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + default: { + static constexpr int32_t TPB = 128; + const int64_t shared_mem_size = N * sizeof(bf16_t); + if (N % 8 == 0) { + device_add_norm_quant_bf16_vpt + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + N, + eps + ); + } else { + device_add_norm_quant_bf16_general + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + N, + eps + ); + } + } + } + + return {output_q, scales}; +} + +} // namespace ops +} // namespace lightllm \ No newline at end of file diff --git a/lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu b/lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu new file mode 100755 index 000000000..b204e9737 --- /dev/null +++ b/lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu @@ -0,0 +1,367 @@ +#include "ops_common.h" +#include "reduce/sm70.cuh" + + +namespace lightllm { +namespace ops { + +using namespace lightllm; + +template +__global__ void device_gelu_per_token_quant_bf16_to_fp8( + const bf16_t* __restrict__ input, // Input tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M // Number of rows in the input tensor +) { + constexpr int32_t VPT = 8; + + static_assert(N % 2 == 0, "N must be even."); + static_assert(N % VPT == 0, "N must be a multiple of VPT."); + + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + const bf16x2_t one = _float22bf162_rn(make_float2(1.0f, 1.0f)); + const bf16x2_t one_2 = _float22bf162_rn(make_float2(0.5f, 0.5f)); + + const bf16_t* _input = input + bid * N; // Input pointer for the group + fp8_e4m3_t* _output = output + bid * N; // Output pointer for the group + + fp32_t* _scales; + _scales = scales + bid; + + // Local arrays for intermediate storage + fp8x4_e4m3_t local_f8[VPT / 4]; + bf16x2_t local_bf16[VPT / 2]; + + __shared__ bf16x2_t workspace[N / 2]; + + fp32_t local_max = -FLT_MAX; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(_input + i, local_bf16); + //gelu + #pragma unroll + for(int32_t j = 0; j< VPT/2; j++){ + fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]); + tmp.x = erf(tmp.x * 0.7071067811f); + tmp.y = erf(tmp.y * 0.7071067811f); + bf16x2_t tan = _float22bf162_rn(tmp); + tan = __hadd2(tan, one); + tan = __hmul2(tan, local_bf16[j]); + tan = __hmul2(tan, one_2); + local_bf16[j] = tan; + } + + vec_copy(local_bf16, workspace + (i >> 1)); + + #pragma unroll + for(int32_t j = 0; j< VPT/2; j++){ + fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]); + fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y)); + local_max = fmaxf(local_max, max); + } + } + + // Reduce the maximum value across the thread group + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(workspace + (i >> 1), local_bf16); + + #pragma unroll + for (int32_t j = 0; j < VPT/4; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_bf16[2 * j + 0]); + fp32x2_t y = bf16x2_to_fp32x2(local_bf16[2 * j + 1]); + fp32x4_t ret = make_float4( + x.x * inv_scale, + x.y * inv_scale, + y.x * inv_scale, + y.y * inv_scale + ); + local_f8[j] = fp8x4_e4m3_t(ret); + } + + vec_copy(local_f8, _output + i); + } + + if(tid == 0){ + *_scales = scale; + } +} + + +template +__global__ void gelu_per_token_quant_bf16_to_fp8_vpt( + const bf16_t* __restrict__ input, // Input tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const int32_t N +) { + constexpr int32_t VPT = 8; + + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + constexpr fp32_t sqrt_2_over_pi = 0.7978845608028654f; + constexpr fp32_t coeff = 0.044715f; + + const bf16_t* _input = input + bid * N; // Input pointer for the group + fp8_e4m3_t* _output = output + bid * N; // Output pointer for the group + + fp32_t* _scales; + _scales = scales + bid; + + // Local arrays for intermediate storage + fp8x4_e4m3_t local_f8[VPT / 4]; + bf16x2_t local_bf16[VPT / 2]; + + extern __shared__ bf16x2_t workspace[]; + + fp32_t local_max = -FLT_MAX; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(_input + i, local_bf16); + + #pragma unroll + for(int32_t j = 0; j< VPT/2; j++){ + fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]); + + fp32_t tanh_arg1 = sqrt_2_over_pi * (tmp.x + coeff * tmp.x * tmp.x * tmp.x); + fp32_t tanh_arg2 = sqrt_2_over_pi * (tmp.y + coeff * tmp.y * tmp.y * tmp.y); + tmp.x = 0.5f * tmp.x * (1.0f + tanhf(tanh_arg1)); + tmp.y = 0.5f * tmp.y * (1.0f + tanhf(tanh_arg2)); + + local_bf16[j] = _float22bf162_rn(tmp); + } + + vec_copy(local_bf16, workspace + (i >> 1)); + + // Compute the max for the VPT elements. + #pragma unroll + for(int32_t j = 0; j< VPT/2; j++){ + fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]); + fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y)); + local_max = fmaxf(local_max, max); + } + } + + // Reduce the maximum value across the thread group + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(workspace + (i >> 1), local_bf16); + + #pragma unroll + for (int32_t j = 0; j < VPT/4; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_bf16[2 * j + 0]); + fp32x2_t y = bf16x2_to_fp32x2(local_bf16[2 * j + 1]); + fp32x4_t ret = make_float4( + x.x * inv_scale, + x.y * inv_scale, + y.x * inv_scale, + y.y * inv_scale + ); + local_f8[j] = fp8x4_e4m3_t(ret); + } + + vec_copy(local_f8, _output + i); + } + + if(tid == 0){ + *_scales = scale; + } +} + + +template +__global__ void gelu_per_token_quant_bf16_to_fp8_general( + const bf16_t* __restrict__ input, // Input tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const int32_t N +) { + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + constexpr fp32_t sqrt_2_over_pi = 0.7978845608028654f; + constexpr fp32_t coeff = 0.044715f; + + const bf16_t* _input = input + bid * N; // Input pointer for the group + fp8_e4m3_t* _output = output + bid * N; // Output pointer for the group + + fp32_t* _scales; + _scales = scales + bid; + + extern __shared__ bf16_t workspace_[]; + + fp32_t local_max = -FLT_MAX; + + for (int32_t i = tid; i < N; i += TPB) { + fp32_t tmp = cvt_bf16_f32(_input[i]); + fp32_t tanh_arg = sqrt_2_over_pi * (tmp + coeff * tmp * tmp * tmp); + tmp = 0.5f * tmp * (1.0f + tanhf(tanh_arg)); + local_max = fmaxf(local_max, fabsf(tmp)); + workspace_[i] = cvt_f32_bf16(tmp); + } + + // Reduce the maximum value across the thread group + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid; i < N; i += TPB) { + // Load the previously stored vectorized data from shared memory. + fp32_t x = cvt_bf16_f32(workspace_[i]); + // Apply normalization: multiply by inv_norm and then scale by the weight. + fp32_t ret = x * inv_scale; + _output[i] = fp8_e4m3_t(ret); + } + + if(tid == 0){ + *_scales = scale; + } +} + +void gelu_per_token_quant_bf16_fp8 ( + Tensor& output, + const Tensor& input, + Tensor& scales +) { + TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); + TORCH_CHECK(input.dim() == 2, "Input must be 2-dimensional"); + TORCH_CHECK(input.scalar_type() == c10::kBFloat16, "Input must be BF16 type"); + + Tensor contiguous_input = input.is_contiguous() ? input : input.contiguous(); + Tensor contiguous_scales = scales.is_contiguous() ? scales : scales.contiguous(); + + const int64_t M = input.size(0); + const int64_t N = input.size(1); + + const int32_t blocks = M; + + switch (N) { + case 16: + device_gelu_per_token_quant_bf16_to_fp8<64, 16> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 32: + device_gelu_per_token_quant_bf16_to_fp8<64, 32> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 64: + device_gelu_per_token_quant_bf16_to_fp8<64, 64> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 512: + device_gelu_per_token_quant_bf16_to_fp8<64, 512> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + + case 1024: + device_gelu_per_token_quant_bf16_to_fp8<128, 1024> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 2048: + device_gelu_per_token_quant_bf16_to_fp8<128, 2048> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 3200: + device_gelu_per_token_quant_bf16_to_fp8<128, 3200> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 4096: + device_gelu_per_token_quant_bf16_to_fp8<256, 4096> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 12800: + device_gelu_per_token_quant_bf16_to_fp8<256, 12800> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + default: { + static constexpr int32_t TPB = 128; + int32_t sharedmem = N / 2 * sizeof(bf16x2_t); + if (N % 8 == 0) { + gelu_per_token_quant_bf16_to_fp8_vpt<128> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M, N + ); + } + else { + gelu_per_token_quant_bf16_to_fp8_general<128> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M, N + ); + } + } + } + return ; +} + +} // namespace ops +} // namespace lightllm \ No newline at end of file diff --git a/lightllm-kernel/csrc/fusion/post_tp_norm.cu b/lightllm-kernel/csrc/fusion/post_tp_norm.cu new file mode 100755 index 000000000..89f711405 --- /dev/null +++ b/lightllm-kernel/csrc/fusion/post_tp_norm.cu @@ -0,0 +1,364 @@ +#include "ops_common.h" +#include "reduce/sm70.cuh" + +namespace lightllm { +namespace ops { + +using namespace lightllm; + +/** + * @brief CUDA kernel to perform RMS normalization on an FP16 tensor. + * + * Each block processes one row of the input tensor. + * + * @tparam TPB Threads per block. + * @tparam N Number of FP16 elements in one row. + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param W Pointer to the weight tensor in global memory. [N] + * @param V Pointer to the variance tensor in global memory. [M] + * @param Y Pointer to the output tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + * @param eps Epsilon for numerical stability. + */ +template +__global__ +void device_post_tp_norm_bf16_general( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + const bf16_t __restrict__ *W, // [N] Weight tensor pointer. + const fp32_t __restrict__ *V, // [M] variance + bf16_t __restrict__ *Y, // [M, N] Output tensor pointer. + const int32_t M, // Number of rows. + const int32_t N, + const int32_t embed_dim, // if multiGPUs, embed_dim differs from N + const fp32_t eps // Epsilon for numerical stability. +) { + const fp32_t r_N = 1 / (fp32_t)embed_dim; // Reciprocal of N. + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + bf16_t* _Y = Y + bid * N; + + // Local registers to hold data. + bf16_t local_x = cvt_f32_bf16(0.0f); + bf16_t local_w = cvt_f32_bf16(0.0f); + bf16_t local_y = cvt_f32_bf16(0.0f); + + fp32_t reduced_square_sum = V[bid]; + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + fp32_t mean_square = reduced_square_sum * r_N; + fp32_t inv_norm = rsqrtf(mean_square + eps); + + for (int32_t i = tid; i < N; i += TPB) { + local_x = _X[i]; + local_w = W[i]; + + fp32_t x = cvt_bf16_f32(local_x); + fp32_t w = cvt_bf16_f32(local_w); + + fp32_t ret = x * inv_norm * w; + local_y = cvt_f32_bf16(ret); + + _Y[i] = local_y; + } +} + + +/** + * @brief CUDA kernel to perform RMS normalization on an FP16 tensor. + * + * Each block processes one row of the input tensor. The kernel loads the + * data in a vectorized manner (using half2), computes the mean square, + * calculates the reciprocal square root (i.e. 1/sqrt(mean_square+eps)), + * and then normalizes the input row element‐wise while scaling with a weight. + * + * @tparam TPB Threads per block. + * @tparam N Number of FP16 elements in one row (must be a multiple of VPT). + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param W Pointer to the weight tensor in global memory. [N] + * @param V Pointer to the variance tensor in global memory. [M] + * @param Y Pointer to the output tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + * @param eps Epsilon for numerical stability. + */ +template +__global__ +void device_post_tp_norm_bf16_vpt( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + const bf16_t __restrict__ *W, // [N] Weight tensor pointer. + const fp32_t __restrict__ *V, // [M] variance + bf16_t __restrict__ *Y, // [M, N] Output tensor pointer. + const int32_t M, // Number of rows. + const int32_t N, + const int32_t embed_dim, // if multiGPUs, embed_dim differs from N + const fp32_t eps // Epsilon for numerical stability. +) { + constexpr int32_t VPT = 8; // Number of bf16 values processed per thread. + const fp32_t r_N = 1 / (fp32_t)embed_dim; // Reciprocal of N. + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + bf16_t* _Y = Y + bid * N; + + // Local registers to hold vectorized data. + bf16x2_t local_x[VPT / 2]; + bf16x2_t local_w[VPT / 2]; + bf16x2_t local_y[VPT / 2]; + + fp32_t reduced_square_sum = V[bid]; + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + fp32_t mean_square = reduced_square_sum * r_N; + fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load the previously stored vectorized data from global memory. + vec_copy(_X + i, local_x); + // Load the corresponding weight values from global memory. + vec_copy(W + i, local_w); + + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_x[j]); + fp32x2_t w = bf16x2_to_fp32x2(local_w[j]); + // Apply normalization: multiply by inv_norm and then scale by the weight. + fp32x2_t ret = make_float2( + x.x * inv_norm * w.x, + x.y * inv_norm * w.y + ); + local_y[j] = _float22bf162_rn(ret); + } + // Write the normalized vectorized data back to global memory. + vec_copy(local_y, _Y + i); + } +} + +/** + * @brief CUDA kernel to perform RMS normalization on an FP16 tensor. + * + * Each block processes one row of the input tensor. The kernel loads the + * data in a vectorized manner (using half2), computes the mean square, + * calculates the reciprocal square root (i.e. 1/sqrt(mean_square+eps)), + * and then normalizes the input row element‐wise while scaling with a weight. + * + * @tparam TPB Threads per block. + * @tparam N Number of FP16 elements in one row (must be a multiple of VPT). + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param W Pointer to the weight tensor in global memory. [N] + * @param V Pointer to the variance tensor in global memory. [M] + * @param Y Pointer to the output tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + * @param eps Epsilon for numerical stability. + */ +template +__global__ +void device_post_tp_norm_bf16( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + const bf16_t __restrict__ *W, // [N] Weight tensor pointer. + const fp32_t __restrict__ *V, // [M] variance + bf16_t __restrict__ *Y, // [M, N] Output tensor pointer. + const int32_t M, // Number of rows. + const int32_t embed_dim, // if multiGPUs, embed_dim differs from N + const fp32_t eps // Epsilon for numerical stability. +) { + constexpr int32_t VPT = 8; // Number of bf16 values processed per thread. + const fp32_t r_N = 1 / (fp32_t)embed_dim; // Reciprocal of N. + + static_assert(N % 2 == 0, "N must be even."); + static_assert(N % VPT == 0, "N must be a multiple of VPT."); + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + bf16_t* _Y = Y + bid * N; + + // Local registers to hold vectorized data. + bf16x2_t local_x[VPT / 2]; + bf16x2_t local_w[VPT / 2]; + bf16x2_t local_y[VPT / 2]; + + fp32_t reduced_square_sum = V[bid]; + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + fp32_t mean_square = reduced_square_sum * r_N; + fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. + # pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load the previously stored vectorized data from global memory. + vec_copy(_X + i, local_x); + // Load the corresponding weight values from global memory. + vec_copy(W + i, local_w); + + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_x[j]); + fp32x2_t w = bf16x2_to_fp32x2(local_w[j]); + // Apply normalization: multiply by inv_norm and then scale by the weight. + fp32x2_t ret = make_float2( + x.x * inv_norm * w.x, + x.y * inv_norm * w.y + ); + local_y[j] = _float22bf162_rn(ret); + } + // Write the normalized vectorized data back to global memory. + vec_copy(local_y, _Y + i); + } +} + +/** + * @brief Launch RMSNorm kernel for FP16 tensors with aligned 16-element rows. + * + * This function validates the input tensors, ensures they are contiguous, + * selects the appropriate kernel configuration based on the row width N, + * and launches the CUDA kernel. + * + * @param X Input tensor with shape [M, N] (FP16, CUDA). + * @param W Weight tensor with shape [N] (FP16, CUDA). + * @param eps Epsilon for numerical stability. + * @return Output tensor with the same shape as X. + */ +Tensor post_tp_norm_bf16(Tensor &X, const Tensor &W, const Tensor &V, const int embed_dim, const fp32_t eps) { + TORCH_CHECK(X.ndimension() == 2 || X.ndimension() == 4, "Input tensor must be 2D or 4D"); + TORCH_CHECK(X.is_cuda(), "Input tensor must be a CUDA tensor."); + TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor must be BF16."); + + Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous(); + Tensor contiguous_W = W.is_contiguous() ? W : W.contiguous(); + Tensor contiguous_V = V.is_contiguous() ? V : V.contiguous(); + + Tensor input_tensor; + uint32_t M, N; + Tensor Y; + + if (X.ndimension() == 2) { + M = contiguous_X.size(0); + N = contiguous_X.size(1); + input_tensor = contiguous_X; + Y = torch::empty_like(input_tensor); + } else { + const uint32_t d0 = contiguous_X.size(0); + const uint32_t d1 = contiguous_X.size(1); + const uint32_t d2 = contiguous_X.size(2); + const uint32_t d3 = contiguous_X.size(3); + + M = d0 * d1; + N = d2 * d3; + input_tensor = contiguous_X.view({M, N}); + Y = torch::empty_like(input_tensor); + } + + // Each CUDA block processes one row. + const int32_t blocks = M; + + // Kernel dispatch based on the value of N. + switch (N) { + case 768: + device_post_tp_norm_bf16<128, 768> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 1024: + device_post_tp_norm_bf16<128, 1024> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 1664: + device_post_tp_norm_bf16<128, 1664> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 2048: + device_post_tp_norm_bf16<128, 2048> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 3200: + device_post_tp_norm_bf16<128, 3200> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 4096: + device_post_tp_norm_bf16<256, 4096> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 8192: + device_post_tp_norm_bf16<512, 8192> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 10240: + device_post_tp_norm_bf16<512, 10240> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + default: + static constexpr int32_t TPB = 256; + if (N % 8 == 0) { + device_post_tp_norm_bf16_vpt + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, N, embed_dim, eps + ); + } else { + device_post_tp_norm_bf16_general + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, N, embed_dim, eps + ); + } + } + + // need to reshape Y back to 4 dimens + if (X.ndimension() == 4) { + Y = Y.reshape(X.sizes()); + } + + return Y; +} + +} // namespace ops +} // namespace lightllm \ No newline at end of file diff --git a/lightllm-kernel/csrc/fusion/pre_tp_norm.cu b/lightllm-kernel/csrc/fusion/pre_tp_norm.cu new file mode 100755 index 000000000..966cf5ce7 --- /dev/null +++ b/lightllm-kernel/csrc/fusion/pre_tp_norm.cu @@ -0,0 +1,257 @@ +#include "ops_common.h" +#include "reduce/sm70.cuh" + +namespace lightllm { +namespace ops { + +using namespace lightllm; + +/** + * @tparam TPB Threads per block. + * @tparam N Number of bf16 elements in one row. + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + */ +template +__global__ +void device_pre_tp_norm_bf16_general( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + fp32_t __restrict__ *V, // [M] Variance tensor pointer. + const int32_t M, // Number of rows. + const int32_t N +) { + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + + bf16_t local_x = cvt_f32_bf16(0.0f); + fp32_t local_square_sum = 0.0f; + for (int32_t i = tid; i < N; i += TPB) { + local_x = _X[i]; + + fp32_t tmp = cvt_bf16_f32(local_x); + + local_square_sum += tmp * tmp; + } + + fp32_t block_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + + if (tid == 0) { + V[bid] = block_square_sum; + } + +} + + + +/** + * @tparam TPB Threads per block. + * @tparam N Number of bf16 elements in one row (must be a multiple of VPT). + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + */ +template +__global__ +void device_pre_tp_norm_bf16_vpt( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + fp32_t __restrict__ *V, // [M] Variance tensor pointer. + const int32_t M, // Number of rows. + const int32_t N +) { + constexpr int32_t VPT = 8; // Number of bf16 values processed per thread. + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + + // Local registers to hold vectorized data. + bf16x2_t local_x[VPT / 2]; + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load VPT bf16 elements from global memory (_X) into local vector (local_x). + vec_copy(_X + i, local_x); + + // Compute the sum of squares for the VPT elements. + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t tmp = bf16x2_to_fp32x2(local_x[j]); + local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y); + } + } + + // Reduce the partial sums across the block, block reduce sum will invoke __syncthread(); + V[bid] = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + +} + + +/** + * @tparam TPB Threads per block. + * @tparam N Number of bf16 elements in one row (must be a multiple of VPT). + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + */ +template +__global__ +void device_pre_tp_norm_bf16( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + fp32_t __restrict__ *V, // [M] Variance tensor pointer. + const int32_t M // Number of rows. +) { + constexpr int32_t VPT = 8; // Number of bf16 values processed per thread. + + static_assert(N % 2 == 0, "N must be even."); + static_assert(N % VPT == 0, "N must be a multiple of VPT."); + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + + // Local registers to hold vectorized data. + bf16x2_t local_x[VPT / 2]; + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + # pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load VPT bf16 elements from global memory (_X) into local vector (local_x). + vec_copy(_X + i, local_x); + + // Compute the sum of squares for the VPT elements. + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t tmp = bf16x2_to_fp32x2(local_x[j]); + local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y); + } + } + + // Reduce the partial sums across the block, block reduce sum will invoke __syncthread(); + V[bid] = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + +} + +/** + * @param X Input tensor with shape [M, N] (bf16, CUDA). + */ +Tensor pre_tp_norm_bf16(Tensor &X) { + TORCH_CHECK(X.ndimension() == 2 || X.ndimension() == 4, "Input tensor must be 2D or 4D"); + TORCH_CHECK(X.is_cuda(), "Input tensor must be a CUDA tensor."); + TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor must be BF16."); + + Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous(); + Tensor input_tensor; + uint32_t M, N; + Tensor V; + + if (X.ndimension() == 2) { + M = contiguous_X.size(0); + N = contiguous_X.size(1); + input_tensor = contiguous_X; + V = torch::empty( + {M}, + torch::TensorOptions() + .dtype(c10::ScalarType::Float) + .device(contiguous_X.device()) + ); + } else { + const uint32_t d0 = contiguous_X.size(0); + const uint32_t d1 = contiguous_X.size(1); + const uint32_t d2 = contiguous_X.size(2); + const uint32_t d3 = contiguous_X.size(3); + + M = d0 * d1; + N = d2 * d3; + input_tensor = contiguous_X.view({M, N}); + V = torch::empty( + {M}, + torch::TensorOptions() + .dtype(c10::ScalarType::Float) + .device(contiguous_X.device()) + ); + } + + + // Each CUDA block processes one row. + const int32_t blocks = M; + + // Kernel dispatch based on the value of N. + switch (N) { + case 768: + device_pre_tp_norm_bf16<128, 768> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 1024: + device_pre_tp_norm_bf16<128, 1024> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 1664: + device_pre_tp_norm_bf16<128, 1664> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 2048: + device_pre_tp_norm_bf16<128, 2048> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 3200: + device_pre_tp_norm_bf16<128, 3200> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 4096: + device_pre_tp_norm_bf16<256, 4096> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 8192: + device_pre_tp_norm_bf16<512, 8192> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 10240: + device_pre_tp_norm_bf16<512, 10240> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + default: { + static constexpr int32_t TPB = 256; + if (N % 8 == 0) { + device_pre_tp_norm_bf16_vpt + <<>>( + PTR(input_tensor), PTR(V), M, N + ); + } else { + device_pre_tp_norm_bf16_general + <<>>( + PTR(input_tensor), PTR(V), M, N + ); + } + } + } + return V; +} + +} // namespace ops +} // namespace lightllm \ No newline at end of file diff --git a/lightllm-kernel/csrc/gemm/Epilogues.md b/lightllm-kernel/csrc/gemm/Epilogues.md new file mode 100755 index 000000000..aae04157b --- /dev/null +++ b/lightllm-kernel/csrc/gemm/Epilogues.md @@ -0,0 +1,147 @@ +# CUTLASS Epilogues + +## Introduction +This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. + +Currently, we only support symmetric quantization for weights, +and symmetric and asymmetric quantization for activations. +Both can be quantized per-tensor or per-channel (weights) / per-token (activations). + +There are 4 epilogues: +1. ScaledEpilogue: symmetric quantization for activations, no bias. +1. ScaledEpilogueBias: symmetric quantization for activations, supports bias. +1. ScaledEpilogueAzp: asymmetric per-tensor quantization for activations, supports bias. +1. ScaledEpilogueAzpPerToken: asymmetric per-token quantization for activations, supports bias. + +We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size. +Instead, if no bias is passed, the epilogue will use 0 as the bias. +That induces a redundant addition operation (and runtime check), but the performance impact is minor. + +## Underlying Linear Algebra + +More details available in the [Activation Quantization RFC](https://github.com/vllm-project/vllm/issues/3975). + +If $` \widehat X `$ is the quantized $` X `$, our matrices become the following + +```math +A = s_a (\widehat A - J_a z_a) +``` +```math +B = s_b \widehat B +``` +```math +D = A B + C +``` +```math +D = s_a s_b \widehat D + C +``` + +Here, D is the output of the GEMM, and C is the bias. +A is the activations and supports asymmetric quantization, +and B is the weights and only supports symmetric quantization. +$ s_a $ and $s_b$ are the scales for activations and weights, respectively. +$ z_a $ is the zero-point for activations, and $ J_a $ is the matrix of all ones with dimensions of A. +Additional epilogues would be required to support asymmetric quantization for weights. + +Expanding further, we can calculate $` \widehat D `$ as follows: + +```math +A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B +``` +```math +A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right) +``` +```math +\widehat D = \widehat A \widehat B - z_a J_a \widehat B +``` + +Note that $` \widehat A \widehat B `$ is the raw output of the GEMM, +and $` J_a \widehat B `$ is known ahead of time. +Each row of it is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of column sums of $` \widehat B `$. + +## Epilogues + +### ScaledEpilogue +This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$. +The output of the GEMM is: + +```math +\widehat D = \widehat A \widehat B +``` +```math +D = s_a s_b \widehat D +``` +```math +D = s_a s_b \widehat A \widehat B +``` + +Epilogue parameters: +- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). +- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). + +### ScaledEpilogueBias +This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$. +The output of the GEMM is: + +```math +\widehat D = \widehat A \widehat B +``` +```math +D = s_a s_b \widehat D + C +``` +```math +D = s_a s_b \widehat A \widehat B + C +``` + + +Epilogue parameters: +- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). +- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). +- `bias` is the bias, is always per-channel (row-vector). + +### ScaledEpilogueAzp +This epilogue computes the asymmetric per-tensor quantization for activations with bias. +The output of the GEMM is: + +```math +\widehat D = \widehat A \widehat B - z_a J_a \widehat B +``` +```math +D = s_a s_b \widehat D + C +``` +```math +D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C +``` + +Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 B `$. +That is precomputed and stored in `azp_with_adj` as a row-vector. + +Epilogue parameters: +- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). + - Generally this will be per-tensor as the zero-points are per-tensor. +- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). +- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector). +- `bias` is the bias, is always per-channel (row-vector). + +To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel. + +### ScaledEpilogueAzpPerToken +This epilogue computes the asymmetric per-token quantization for activations with bias. + +The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector. +That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$. + +Epilogue parameters: +- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). + - Generally this will be per-token as the zero-points are per-token. +- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). +- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector). +- `azp` is the zero-point (`z_a`), is per-token (column-vector). +- `bias` is the bias, is always per-channel (row-vector). + +To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel. + +The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM): +``` +out = scale_a * scale_b * (Dq - azp_adj * azp) + bias +``` diff --git a/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu b/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu new file mode 100755 index 000000000..55d623755 --- /dev/null +++ b/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu @@ -0,0 +1,73 @@ +#include + +#if defined CUDA_VERSION && CUDA_VERSION >= 12000 + + #include "scaled_mm_c3x_sm90_fp8_dispatch.cuh" + #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace lightllm { +namespace ops { + +using namespace lightllm; +/* + This file defines quantized GEMM operations using the CUTLASS 3.x API, for + NVIDIA GPUs with sm90a (Hopper) or later. +*/ + +template