26 changes: 26 additions & 0 deletions open_source/deps/requirements_lock_torch_gpu_cuda12.txt
@@ -587,6 +587,9 @@ filelock==3.13.1 \
flash-attn @ https://rtp-opensource.oss-cn-hangzhou.aliyuncs.com/rtp_llm/flash_attn-2.7.4.post1%2Bcu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl \
--hash=sha256:bfdb0f290cc3d21d0810ba49a360ef91090f62cdc1345ec6900447e0d12d99af
# via -r open_source/deps/requirements_torch_gpu_cuda12.txt
flashinfer-python==0.2.5 \
--hash=sha256:990aa090ef781783e76b836696ece4efd23956f72b5696d622fc619a61162aef
# via -r open_source/deps/requirements_torch_gpu_cuda12.txt
fonttools==4.53.1 \
--hash=sha256:02569e9a810f9d11f4ae82c391ebc6fb5730d95a0657d24d754ed7763fb2d122 \
--hash=sha256:0679a30b59d74b6242909945429dbddb08496935b82f91ea9bf6ad240ec23397 \
@@ -1549,6 +1552,27 @@ networkx==3.3 \
--hash=sha256:0c127d8b2f4865f59ae9cb8aafcd60b5c70f3241ebd66f7defad7c4ab90126c9 \
--hash=sha256:28575580c6ebdaf4505b22c6256a2b9de86b316dc63ba9e93abde3d78dfdbcf2
# via torch
ninja==1.13.0 \
--hash=sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f \
--hash=sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988 \
--hash=sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9 \
--hash=sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630 \
--hash=sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db \
--hash=sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978 \
--hash=sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1 \
--hash=sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72 \
--hash=sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e \
--hash=sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2 \
--hash=sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9 \
--hash=sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714 \
--hash=sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200 \
--hash=sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c \
--hash=sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5 \
--hash=sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96 \
--hash=sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1 \
--hash=sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa \
--hash=sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e
# via flashinfer-python
numba==0.60.0 \
--hash=sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74 \
--hash=sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b \
@@ -1609,6 +1633,7 @@ numpy==1.24.1 \
# contourpy
# datasets
# decord
# flashinfer-python
# gekko
# librosa
# matplotlib
@@ -3172,6 +3197,7 @@ torch @ https://mirrors.aliyun.com/pytorch-wheels/cu126/torch-2.6.0%2Bcu126-cp31
# autoawq-kernels
# bitsandbytes
# flash-attn
# flashinfer-python
# peft
# sentence-transformers
# timm
3 changes: 2 additions & 1 deletion open_source/deps/requirements_torch_gpu_cuda12.txt
@@ -7,4 +7,5 @@ https://mirrors.aliyun.com/pytorch-wheels/cu126/torchvision-0.21.0%2Bcu126-cp310
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
tensorrt==10.3.0
tensorrt-cu12-bindings==10.3.0
tensorrt-cu12-libs==10.3.0
tensorrt-cu12-libs==10.3.0
flashinfer-python==0.2.5
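A quick sanity check that the new pin resolves inside the CUDA12 environment (a minimal sketch; it assumes flashinfer exposes __version__, which recent flashinfer-python releases do):

# Illustrative check, not part of the diff: run after installing the
# requirements above in the CUDA12 environment.
import flashinfer

assert flashinfer.__version__ == "0.2.5", flashinfer.__version__
print("flashinfer-python OK:", flashinfer.__version__)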
9 changes: 7 additions & 2 deletions rtp_llm/BUILD
@@ -22,6 +22,10 @@ tensorrt = [
"tensorrt-cu12-libs",
]

flashinfer = [
"flashinfer-python",
]

xft_dep = select({
"@//:using_arm": [],
"//:xft_use_icx": [
@@ -98,7 +102,7 @@ requirement([
"concurrent_log_handler",
"aiter",
"fastsafetensors",
] + tensorrt)
] + tensorrt + flashinfer)

filegroup(
name = "cutlass_config",
@@ -210,7 +214,7 @@ py_library(
"//rtp_llm/model_loader:loader",
"//rtp_llm/models_py:models",
] + arch_dep + select({
"@//:using_cuda12": tensorrt,
"@//:using_cuda12": tensorrt + flashinfer,
"//conditions:default": []
}) + select({
"@//:using_arm": [],
@@ -530,6 +534,7 @@ whl_reqs = [
"bitsandbytes",
"portalocker",
"concurrent_log_handler",
"flashinfer-python==0.2.5",
] + whl_deps() + platform_deps() + xft_dep

py_wheel(
1 change: 1 addition & 0 deletions rtp_llm/cpp/pybind/BUILD
@@ -47,6 +47,7 @@ cc_library(
deps = [
"//rtp_llm/cpp/utils:core_utils",
"//rtp_llm/cpp/devices:devices_base",
"//rtp_llm/cpp/devices:device_utils",
"@havenask//aios/autil:base64",
"@havenask//aios/autil:zlib",
] + torch_deps() + select_py_bindings(),
8 changes: 7 additions & 1 deletion rtp_llm/models/deepseek_v2.py
@@ -27,6 +27,8 @@
from rtp_llm.models.rotary_embedding.deepseek_rotary_embedding import (
DeepseekV3YarnRotaryEmbedding,
)
from rtp_llm.models_py.model_desc.deepseek_v2 import DeepSeekV2Model
from rtp_llm.models_py.model_desc.module_base import GptModelBase
from rtp_llm.utils.model_weight import (
CkptWeightInfo,
W,
@@ -516,10 +518,14 @@ def _create_config(cls, ckpt_path: str):
norm_type="rmsnorm",
has_post_decoder_layernorm=True,
)
config.activation_type = "gated-silu"
# config.activation_type = "gated-silu"
config.activation_type = "SiGLU"
DeepSeekV2._from_hf(config, ckpt_path)
return config

def _create_python_model(self) -> Optional[GptModelBase]:
self.py_model = DeepSeekV2Model(self.config, self.weight)
return self.py_model

@staticmethod
def _from_hf(config: GptInitModelParameters, ckpt_path: str):
config_path = os.path.join(ckpt_path, "config.json")
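For context, a minimal sketch of the contract this new hook follows (the call site below is an assumption for illustration; only DeepSeekV2Model and GptModelBase come from this change):

# Hypothetical call site: a runner asks the model wrapper for a
# python-side implementation and falls back to the C++ path on None.
py_model = model._create_python_model()  # sets and returns self.py_model
if py_model is not None:
    assert isinstance(py_model, GptModelBase)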
9 changes: 8 additions & 1 deletion rtp_llm/models_py/BUILD
@@ -33,6 +33,10 @@ py_library(
visibility = ["//visibility:public"],
)

# flashinfer-python is only available for CUDA12
flashinfer = ["flashinfer-python"]
requirement(flashinfer)

py_library(
name = "modules",
srcs = glob([
@@ -43,7 +47,10 @@
":utils",
":kernels",
":distributed",
],
] + select({
"@//:using_cuda12": flashinfer,
"//conditions:default": [],
}),
visibility = ["//visibility:public"],
)

1 change: 1 addition & 0 deletions rtp_llm/models_py/bindings/BUILD
@@ -7,6 +7,7 @@ cc_library(
"OpDefs.h",
"OpDefsUtils.h",
"ParamsBase.h",
"MlaParamsBase.h",
],
srcs = [
"OpDefs.cc",
18 changes: 18 additions & 0 deletions rtp_llm/models_py/bindings/MlaParamsBase.h
@@ -0,0 +1,18 @@
#pragma once
#include <memory>
#include <torch/extension.h>
#include "rtp_llm/models_py/bindings/OpDefs.h"

namespace rtp_llm {

class MlaParamsBase {
public:
virtual ~MlaParamsBase() = default;
virtual torch_ext::MlaParams fillParams(torch::Tensor t_prefix_lengths,
torch::Tensor t_sequence_lengths,
torch::Tensor t_input_lengths,
torch::Tensor t_kv_cache_block_id_host,
int seq_size_per_block) = 0;
};

} // namespace rtp_llm
22 changes: 16 additions & 6 deletions rtp_llm/models_py/bindings/OpDefs.cc
@@ -3,9 +3,19 @@
namespace torch_ext {

void registerPyOpDefs(pybind11::module& m) {
pybind11::class_<MlaParams>(m, "MlaParams")
.def(pybind11::init<>())
.def_readonly("batch_indice", &MlaParams::batch_indice)
.def_readonly("positions", &MlaParams::positions)
.def_readonly("paged_kv_last_page_len", &MlaParams::paged_kv_last_page_len)
.def_readonly("kvlen", &MlaParams::kvlen)
.def_readonly("page_indice", &MlaParams::page_indice)
.def_readonly("page_indptr", &MlaParams::page_indptr)
.def_readonly("qo_indptr", &MlaParams::qo_indptr);

pybind11::class_<KVCache>(m, "KVCache")
.def(pybind11::init<>())
.def_readonly("k_cache_base", &KVCache::k_cache_base, "Key cache base tensor")
.def_readwrite("k_cache_base", &KVCache::k_cache_base, "Key cache base tensor")
.def_readonly("v_cache_base", &KVCache::v_cache_base, "Value cache base tensor")
.def_readonly("k_scale_base", &KVCache::k_scale_base, "Key cache scale tensor")
.def_readonly("v_scale_base", &KVCache::v_scale_base, "Value cache scale tensor")
@@ -43,12 +53,12 @@ void registerPyOpDefs(pybind11::module& m) {

pybind11::class_<PyAttentionInputs>(m, "PyAttentionInputs")
.def(pybind11::init<>())
.def_readonly("is_prefill", &PyAttentionInputs::is_prefill)
.def_readonly("prefix_lengths", &PyAttentionInputs::prefix_lengths)
.def_readonly("sequence_lengths", &PyAttentionInputs::sequence_lengths)
.def_readonly("input_lengths", &PyAttentionInputs::input_lengths)
.def_readwrite("is_prefill", &PyAttentionInputs::is_prefill)
.def_readwrite("prefix_lengths", &PyAttentionInputs::prefix_lengths)
.def_readwrite("sequence_lengths", &PyAttentionInputs::sequence_lengths)
.def_readwrite("input_lengths", &PyAttentionInputs::input_lengths)
.def_readonly("cu_seqlens", &PyAttentionInputs::cu_seqlens)
.def_readonly("kv_cache_block_id_host", &PyAttentionInputs::kv_cache_block_id_host)
.def_readwrite("kv_cache_block_id_host", &PyAttentionInputs::kv_cache_block_id_host)
.def_readonly("kv_cache_block_id_device", &PyAttentionInputs::kv_cache_block_id_device)
.def_readonly("dtype", &PyAttentionInputs::dtype)
.def_readonly("kv_block_offset", &PyAttentionInputs::kv_block_offset)
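The readonly-to-readwrite switches above let Python code populate attention inputs before dispatch instead of treating them as output-only views. A minimal sketch of what that enables (the bindings import path below is an assumption; the actual module name depends on the build):

# Sketch only: "libth_transformer" as the module name is hypothetical.
import torch
from libth_transformer import PyAttentionInputs

attn = PyAttentionInputs()
attn.is_prefill = False  # now assignable from Python
attn.sequence_lengths = torch.tensor([5, 2], dtype=torch.int32)
attn.input_lengths = torch.tensor([1, 1], dtype=torch.int32)
attn.kv_cache_block_id_host = torch.tensor([[0, 1], [2, 3]], dtype=torch.int32)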
9 changes: 9 additions & 0 deletions rtp_llm/models_py/bindings/OpDefs.h
@@ -7,6 +7,15 @@
#include "rtp_llm/models_py/bindings/ParamsBase.h"
#include "rtp_llm/cpp/utils/Logger.h"
namespace torch_ext {
struct MlaParams {
torch::Tensor batch_indice;
torch::Tensor positions;
torch::Tensor paged_kv_last_page_len;
torch::Tensor kvlen;
torch::Tensor page_indice;
torch::Tensor page_indptr;
torch::Tensor qo_indptr;
};

struct KVCache {
torch::Tensor k_cache_base;
1 change: 1 addition & 0 deletions rtp_llm/models_py/bindings/common/Torch_ext.h
@@ -23,6 +23,7 @@ using StreamType = hipStream_t;
#include <cuda_fp8.h>
#include "rtp_llm/cpp/kernels/layernorm_kernels.h"
#include "rtp_llm/cpp/kernels/fused_qk_rmsnorm.h"
#include "rtp_llm/cpp/kernels/rmsnormKernels.h"
using bf16_type = nv_bfloat16;
using StreamType = cudaStream_t;
#define GET_CURRENT_STREAM() at::cuda::getCurrentCUDAStream(at::cuda::current_device()).stream()
121 changes: 121 additions & 0 deletions rtp_llm/models_py/bindings/cuda/FlashInferMlaParams.cc
@@ -0,0 +1,121 @@
#include "rtp_llm/models_py/bindings/cuda/FlashInferMlaParams.h"
#include "rtp_llm/cpp/utils/AssertUtils.h"
#include "rtp_llm/cpp/core/torch_utils/BufferTorchUtils.h"
#include <algorithm>
#include <cstdint>
using namespace torch_ext;

namespace rtp_llm {

MlaParams FlashInferMlaAttnParams::fillParams(torch::Tensor t_prefix_lengths,
torch::Tensor t_sequence_lengths,
torch::Tensor t_input_lengths,
torch::Tensor t_kv_cache_block_id_host,
int seq_size_per_block) {
MlaParams params;
auto sequence_lengths_host = torchTensor2Buffer(t_sequence_lengths);
auto input_lengths_host = torchTensor2Buffer(t_input_lengths);

BufferPtr kv_cache_block_id_host;
if (t_kv_cache_block_id_host.size(0)) {
kv_cache_block_id_host = torchTensor2Buffer(t_kv_cache_block_id_host);
}

BufferPtr prefix_lengths_host;
if (t_prefix_lengths.size(0)) {
prefix_lengths_host = torchTensor2Buffer(t_prefix_lengths);
}

const int max_batch_blocks = kv_cache_block_id_host ? kv_cache_block_id_host->shape()[1] : -1;
const int batch_size = input_lengths_host->shape()[0];

int max_kv_len = 0;
int max_q_len = 0;
int accu_q_len = 0;
int offset = 0;
int total_page_idx = 0;

std::vector<int32_t> batch_indice;
std::vector<int32_t> positions;
std::vector<int32_t> paged_kv_last_page_len;
std::vector<int32_t> kvlen;
std::vector<int32_t> page_indice;
std::vector<int32_t> page_indptr = {0};
std::vector<int32_t> qo_indptr = {0};

auto input_lengths = input_lengths_host->data<int>();
auto prefix_lengths = prefix_lengths_host ? prefix_lengths_host->data<int>() : nullptr;
auto sequence_lengths = sequence_lengths_host ? sequence_lengths_host->data<int>() : nullptr;
auto kv_cache_block_id = kv_cache_block_id_host ? kv_cache_block_id_host->data<int>() : nullptr;

for (int i = 0; i < batch_size; i++) {
int seq_len = 0;
if (prefix_lengths) {
int input_length = input_lengths[i];
int prefix_length = prefix_lengths[i];

for (int j = 0; j < input_length; j++) {
batch_indice.push_back(i);
positions.push_back(j + prefix_length);
offset += 1;
}
seq_len = input_length + prefix_length;
max_q_len = std::max(max_q_len, input_length);
accu_q_len += input_length;
} else {
batch_indice.push_back(i);
positions.push_back(sequence_lengths[i]);
seq_len = sequence_lengths[i] + 1;
accu_q_len += 1;
}
paged_kv_last_page_len.push_back((seq_len - 1) % seq_size_per_block + 1);
kvlen.push_back(seq_len);
max_kv_len = std::max(seq_len, max_kv_len);

int page_num = (seq_len + seq_size_per_block - 1) / seq_size_per_block;

if (kv_cache_block_id) {
for (int j = 0; j < page_num; j++) {
auto page_idx = kv_cache_block_id[i * max_batch_blocks + j];
page_indice.push_back(page_idx);
total_page_idx++;
}
}
if (prefix_lengths) {
page_indptr.push_back(seq_len);
} else {
page_indptr.push_back(total_page_idx);
}
// page_indptr.push_back(total_page_idx);
qo_indptr.push_back(accu_q_len);
}
auto cuda_option = torch::dtype(torch::kInt).device(torch::DeviceType::CUDA).requires_grad(false);
params.batch_indice = torch::tensor(batch_indice, cuda_option);
params.page_indice = torch::tensor(page_indice, cuda_option);
params.page_indptr = torch::tensor(page_indptr, cuda_option);
params.paged_kv_last_page_len = torch::tensor(paged_kv_last_page_len, cuda_option);
params.qo_indptr = torch::tensor(qo_indptr, cuda_option);
params.kvlen = torch::tensor(kvlen, cuda_option);
params.positions = torch::tensor(positions, cuda_option);
return params;
}

void registerPyFlashInferMlaParams(pybind11::module& m) {
m.def(
"fill_mla_params",
[](torch::Tensor t_prefill_lengths,
torch::Tensor t_sequence_lengths,
torch::Tensor t_input_lengths,
torch::Tensor t_kv_cache_block_id_host,
int seq_size_per_block) {
auto params = std::make_shared<rtp_llm::FlashInferMlaAttnParams>();
return params->fillParams(
t_prefill_lengths, t_sequence_lengths, t_input_lengths, t_kv_cache_block_id_host, seq_size_per_block);
},
pybind11::arg("t_prefill_lengths"),
pybind11::arg("t_sequence_lengths"),
pybind11::arg("t_input_lengths"),
pybind11::arg("t_kv_cache_block_id_host"),
pybind11::arg("seq_size_per_block"));
}

} // namespace rtp_llm
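To make the index layout concrete, here is a worked decode-path example (empty prefix_lengths) for two sequences with seq_size_per_block = 4; it mirrors the loop above in plain Python and is illustrative only:

# Decode step: sequence_lengths = [5, 2] tokens already cached; the block
# table rows play the role of kv_cache_block_id_host.
seq_size_per_block = 4
sequence_lengths = [5, 2]
block_table = [[0, 1], [2, 3]]

batch_indice, positions, last_page_len, kvlen = [], [], [], []
page_indice, page_indptr, qo_indptr = [], [0], [0]
total_pages, accu_q = 0, 0
for i, cached in enumerate(sequence_lengths):
    batch_indice.append(i)
    positions.append(cached)              # position of the new token
    seq_len = cached + 1                  # cache length after this step
    accu_q += 1                           # one query token per decode step
    last_page_len.append((seq_len - 1) % seq_size_per_block + 1)
    kvlen.append(seq_len)
    pages = (seq_len + seq_size_per_block - 1) // seq_size_per_block
    page_indice += block_table[i][:pages]
    total_pages += pages
    page_indptr.append(total_pages)
    qo_indptr.append(accu_q)

# batch_indice  -> [0, 1]      positions     -> [5, 2]
# kvlen         -> [6, 3]      last_page_len -> [2, 3]
# page_indice   -> [0, 1, 2]   page_indptr   -> [0, 2, 3]
# qo_indptr     -> [0, 1, 2]
# i.e. the CSR-style paged layout FlashInfer's MLA kernels consume.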
23 changes: 23 additions & 0 deletions rtp_llm/models_py/bindings/cuda/FlashInferMlaParams.h
@@ -0,0 +1,23 @@
#pragma once

#include <torch/extension.h>
#include <vector>
#include <memory>
#include "rtp_llm/models_py/bindings/OpDefs.h"
#include "rtp_llm/models_py/bindings/MlaParamsBase.h"

using namespace torch_ext;

namespace rtp_llm {

class FlashInferMlaAttnParams: public MlaParamsBase {
public:
MlaParams fillParams(torch::Tensor t_prefix_lengths,
torch::Tensor t_sequence_lengths,
torch::Tensor t_input_lengths,
torch::Tensor t_kv_cache_block_id_host,
int seq_size_per_block) override;
};
void registerPyFlashInferMlaParams(pybind11::module& m);

} // namespace rtp_llm
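Under the same caveat about the bindings import path, a minimal usage sketch of the registered fill_mla_params entry point (decode case, matching the worked example above):

import torch
# Hypothetical import path; fill_mla_params is the name registered above.
from libth_transformer import fill_mla_params

no_prefix = torch.empty(0, dtype=torch.int32)  # empty => decode branch
params = fill_mla_params(no_prefix,
                         torch.tensor([5, 2], dtype=torch.int32),  # sequence_lengths
                         torch.tensor([1, 1], dtype=torch.int32),  # input_lengths
                         torch.tensor([[0, 1], [2, 3]], dtype=torch.int32),
                         seq_size_per_block=4)
print(params.qo_indptr)  # int32 CUDA tensor: [0, 1, 2]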