
Commit 71c2807

Nancheng-11LLLLKKKK authored and committed
feature - adapt deepseek in model py
1 parent 8252afc · commit 71c2807

26 files changed (+1928 −30 lines)

open_source/deps/requirements_lock_torch_gpu_cuda12.txt

Lines changed: 26 additions & 0 deletions
@@ -587,6 +587,9 @@ filelock==3.13.1 \
 flash-attn @ https://rtp-opensource.oss-cn-hangzhou.aliyuncs.com/rtp_llm/flash_attn-2.7.4.post1%2Bcu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl \
     --hash=sha256:bfdb0f290cc3d21d0810ba49a360ef91090f62cdc1345ec6900447e0d12d99af
     # via -r open_source/deps/requirements_torch_gpu_cuda12.txt
+flashinfer-python==0.2.5 \
+    --hash=sha256:990aa090ef781783e76b836696ece4efd23956f72b5696d622fc619a61162aef
+    # via -r open_source/deps/requirements_torch_gpu_cuda12.txt
 fonttools==4.53.1 \
     --hash=sha256:02569e9a810f9d11f4ae82c391ebc6fb5730d95a0657d24d754ed7763fb2d122 \
     --hash=sha256:0679a30b59d74b6242909945429dbddb08496935b82f91ea9bf6ad240ec23397 \
@@ -1549,6 +1552,27 @@ networkx==3.3 \
     --hash=sha256:0c127d8b2f4865f59ae9cb8aafcd60b5c70f3241ebd66f7defad7c4ab90126c9 \
     --hash=sha256:28575580c6ebdaf4505b22c6256a2b9de86b316dc63ba9e93abde3d78dfdbcf2
     # via torch
+ninja==1.13.0 \
+    --hash=sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f \
+    --hash=sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988 \
+    --hash=sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9 \
+    --hash=sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630 \
+    --hash=sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db \
+    --hash=sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978 \
+    --hash=sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1 \
+    --hash=sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72 \
+    --hash=sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e \
+    --hash=sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2 \
+    --hash=sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9 \
+    --hash=sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714 \
+    --hash=sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200 \
+    --hash=sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c \
+    --hash=sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5 \
+    --hash=sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96 \
+    --hash=sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1 \
+    --hash=sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa \
+    --hash=sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e
+    # via flashinfer-python
 numba==0.60.0 \
     --hash=sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74 \
     --hash=sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b \
@@ -1609,6 +1633,7 @@ numpy==1.24.1 \
     # contourpy
     # datasets
     # decord
+    # flashinfer-python
     # gekko
     # librosa
     # matplotlib
@@ -3172,6 +3197,7 @@ torch @ https://mirrors.aliyun.com/pytorch-wheels/cu126/torch-2.6.0%2Bcu126-cp31
     # autoawq-kernels
     # bitsandbytes
     # flash-attn
+    # flashinfer-python
     # peft
     # sentence-transformers
     # timm

open_source/deps/requirements_torch_gpu_cuda12.txt

Lines changed: 2 additions & 1 deletion
@@ -7,4 +7,5 @@ https://mirrors.aliyun.com/pytorch-wheels/cu126/torchvision-0.21.0%2Bcu126-cp310
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
 tensorrt==10.3.0
 tensorrt-cu12-bindings==10.3.0
-tensorrt-cu12-libs==10.3.0
+tensorrt-cu12-libs==10.3.0
+flashinfer-python==0.2.5
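
Both requirements files now pin flashinfer-python at 0.2.5. As a quick sanity check that the pinned wheel resolves in a CUDA 12 environment, a minimal sketch (not part of the commit; flashinfer-python installs under the module name `flashinfer`):

    # Minimal sketch, assuming the lock file above has been installed.
    import importlib.metadata

    import flashinfer  # noqa: F401  (import fails if the wheel is missing)

    assert importlib.metadata.version("flashinfer-python") == "0.2.5"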

rtp_llm/BUILD

Lines changed: 7 additions & 2 deletions
@@ -22,6 +22,10 @@ tensorrt = [
     "tensorrt-cu12-libs",
 ]
 
+flashinfer = [
+    "flashinfer-python",
+]
+
 xft_dep = select({
     "@//:using_arm": [],
     "//:xft_use_icx": [
@@ -98,7 +102,7 @@ requirement([
     "concurrent_log_handler",
     "aiter",
     "fastsafetensors",
-] + tensorrt)
+] + tensorrt + flashinfer)
 
 filegroup(
     name = "cutlass_config",
@@ -210,7 +214,7 @@ py_library(
     "//rtp_llm/model_loader:loader",
     "//rtp_llm/models_py:models",
 ] + arch_dep + select({
-    "@//:using_cuda12": tensorrt,
+    "@//:using_cuda12": tensorrt + flashinfer,
     "//conditions:default": []
 }) + select({
     "@//:using_arm": [],
@@ -540,6 +544,7 @@ whl_reqs = [
     "bitsandbytes",
     "portalocker",
     "concurrent_log_handler",
+    "flashinfer-python==0.2.5",
 ] + whl_deps() + platform_deps() + xft_dep
 
 py_wheel(
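
The select() above makes flashinfer a dependency only for "@//:using_cuda12" builds. A hedged runtime analogue of that build-time gate (the helper name is hypothetical, not from this commit):

    # Sketch: import flashinfer only when the torch build targets CUDA 12,
    # mirroring the "@//:using_cuda12" select() in the BUILD file above.
    import torch

    def try_import_flashinfer():
        """Return the flashinfer module on CUDA 12 builds, else None."""
        cuda = torch.version.cuda  # e.g. "12.6", or None on CPU/ROCm builds
        if cuda is None or not cuda.startswith("12"):
            return None
        try:
            import flashinfer
            return flashinfer
        except ImportError:
            return None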

rtp_llm/cpp/pybind/BUILD

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ cc_library(
     deps = [
         "//rtp_llm/cpp/utils:core_utils",
         "//rtp_llm/cpp/devices:devices_base",
+        "//rtp_llm/cpp/devices:device_utils",
         "@havenask//aios/autil:base64",
         "@havenask//aios/autil:zlib",
     ] + torch_deps() + select_py_bindings(),

rtp_llm/models/deepseek_v2.py

Lines changed: 7 additions & 1 deletion
@@ -27,6 +27,8 @@
 from rtp_llm.models.rotary_embedding.deepseek_rotary_embedding import (
     DeepseekV3YarnRotaryEmbedding,
 )
+from rtp_llm.models_py.model_desc.deepseek_v2 import DeepSeekV2Model
+from rtp_llm.models_py.model_desc.module_base import GptModelBase
 from rtp_llm.utils.model_weight import (
     CkptWeightInfo,
     W,
@@ -516,10 +518,14 @@ def _create_config(cls, ckpt_path: str):
             norm_type="rmsnorm",
             has_post_decoder_layernorm=True,
         )
-        config.activation_type = "gated-silu"
+        # config.activation_type = "gated-silu"
+        config.activation_type = "SiGLU"
         DeepSeekV2._from_hf(config, ckpt_path)
         return config
 
+    def _create_python_model(self) -> Optional[GptModelBase]:
+        self.py_model = DeepSeekV2Model(self.config, self.weight)
+
     @staticmethod
     def _from_hf(config: GptInitModelParameters, ckpt_path: str):
         config_path = os.path.join(ckpt_path, "config.json")
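
Note that _create_python_model() stores the PyTorch-native implementation on self.py_model rather than returning it, despite the Optional[GptModelBase] annotation. A hedged sketch of the expected call site (the entry-point name and loading flow are assumptions, not shown in this diff):

    # Hypothetical caller; `from_pretrained` is an assumed entry point.
    model = DeepSeekV2.from_pretrained(ckpt_path)  # loads config + weights
    model._create_python_model()                   # populates model.py_model
    assert isinstance(model.py_model, DeepSeekV2Model)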

rtp_llm/models_py/BUILD

Lines changed: 8 additions & 1 deletion
@@ -33,6 +33,10 @@ py_library(
     visibility = ["//visibility:public"],
 )
 
+# flashinfer-python is only available for CUDA12
+flashinfer = ["flashinfer-python"]
+requirement(flashinfer)
+
 py_library(
     name = "modules",
     srcs = glob([
@@ -43,7 +47,10 @@ py_library(
         ":utils",
         ":kernels",
         ":distributed",
-    ],
+    ] + select({
+        "@//:using_cuda12": flashinfer,
+        "//conditions:default": [],
+    }),
     visibility = ["//visibility:public"],
 )

rtp_llm/models_py/bindings/BUILD

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ cc_library(
     hdrs = [
         "OpDefs.h",
         "OpDefsUtils.h",
         "ParamsBase.h",
+        "MlaParamsBase.h",
     ],
     srcs = [
         "OpDefs.cc",
rtp_llm/models_py/bindings/MlaParamsBase.h

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+#pragma once
+#include <memory>
+#include <torch/extension.h>
+#include "rtp_llm/models_py/bindings/OpDefs.h"
+
+namespace rtp_llm {
+
+class MlaParamsBase {
+public:
+    virtual ~MlaParamsBase() = default;
+    torch_ext::MlaParams fillParams(torch::Tensor t_prefix_lengths,
+                                    torch::Tensor t_sequence_lengths,
+                                    torch::Tensor t_input_lengths,
+                                    torch::Tensor t_kv_cache_block_id_host,
+                                    int           seq_size_per_block);
+};
+
+} // namespace rtp_llm
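
fillParams() turns host-side length and block-id tensors into the paged-KV metadata that an MLA attention backend consumes. A minimal Python sketch of the likely computation for a prefill batch with no reused prefix cache; this is an inference from the MlaParams field names, not the actual C++ implementation:

    import torch

    def fill_mla_params(input_lengths: torch.Tensor,      # [batch] prompt lengths
                        kv_cache_block_id: torch.Tensor,  # [batch, max_blocks] block table
                        seq_size_per_block: int):
        batch = input_lengths.numel()
        kvlen = input_lengths.clone()          # assumption: kv len == query len
        qo_indptr = torch.zeros(batch + 1, dtype=torch.int32)
        qo_indptr[1:] = torch.cumsum(input_lengths, dim=0)
        # Number of KV-cache pages each request touches (ceil division).
        pages = (kvlen + seq_size_per_block - 1) // seq_size_per_block
        page_indptr = torch.zeros(batch + 1, dtype=torch.int32)
        page_indptr[1:] = torch.cumsum(pages, dim=0)
        # Flatten only the pages each request actually uses.
        page_indice = torch.cat(
            [kv_cache_block_id[i, : pages[i]] for i in range(batch)])
        # Tokens occupying the final page of each request.
        paged_kv_last_page_len = kvlen - (pages - 1) * seq_size_per_block
        return qo_indptr, page_indptr, page_indice, paged_kv_last_page_len

These arrays correspond one-to-one to the MlaParams fields bound to Python in OpDefs.cc below.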

rtp_llm/models_py/bindings/OpDefs.cc

Lines changed: 16 additions & 6 deletions
@@ -3,9 +3,19 @@
 namespace torch_ext {
 
 void registerPyOpDefs(pybind11::module& m) {
+    pybind11::class_<MlaParams>(m, "MlaParams")
+        .def(pybind11::init<>())
+        .def_readonly("batch_indice", &MlaParams::batch_indice)
+        .def_readonly("positions", &MlaParams::positions)
+        .def_readonly("paged_kv_last_page_len", &MlaParams::paged_kv_last_page_len)
+        .def_readonly("kvlen", &MlaParams::kvlen)
+        .def_readonly("page_indice", &MlaParams::page_indice)
+        .def_readonly("page_indptr", &MlaParams::page_indptr)
+        .def_readonly("qo_indptr", &MlaParams::qo_indptr);
+
     pybind11::class_<KVCache>(m, "KVCache")
         .def(pybind11::init<>())
-        .def_readonly("k_cache_base", &KVCache::k_cache_base, "Key cache base tensor")
+        .def_readwrite("k_cache_base", &KVCache::k_cache_base, "Key cache base tensor")
         .def_readonly("v_cache_base", &KVCache::v_cache_base, "Value cache base tensor")
         .def_readonly("k_scale_base", &KVCache::k_scale_base, "Key cache scale tensor")
         .def_readonly("v_scale_base", &KVCache::v_scale_base, "Value cache scale tensor")
@@ -43,12 +53,12 @@ void registerPyOpDefs(pybind11::module& m) {
 
     pybind11::class_<PyAttentionInputs>(m, "PyAttentionInputs")
         .def(pybind11::init<>())
-        .def_readonly("is_prefill", &PyAttentionInputs::is_prefill)
-        .def_readonly("prefix_lengths", &PyAttentionInputs::prefix_lengths)
-        .def_readonly("sequence_lengths", &PyAttentionInputs::sequence_lengths)
-        .def_readonly("input_lengths", &PyAttentionInputs::input_lengths)
+        .def_readwrite("is_prefill", &PyAttentionInputs::is_prefill)
+        .def_readwrite("prefix_lengths", &PyAttentionInputs::prefix_lengths)
+        .def_readwrite("sequence_lengths", &PyAttentionInputs::sequence_lengths)
+        .def_readwrite("input_lengths", &PyAttentionInputs::input_lengths)
         .def_readonly("cu_seqlens", &PyAttentionInputs::cu_seqlens)
-        .def_readonly("kv_cache_block_id_host", &PyAttentionInputs::kv_cache_block_id_host)
+        .def_readwrite("kv_cache_block_id_host", &PyAttentionInputs::kv_cache_block_id_host)
         .def_readonly("kv_cache_block_id_device", &PyAttentionInputs::kv_cache_block_id_device)
         .def_readonly("dtype", &PyAttentionInputs::dtype)
         .def_readonly("kv_block_offset", &PyAttentionInputs::kv_block_offset)

rtp_llm/models_py/bindings/OpDefs.h

Lines changed: 9 additions & 0 deletions
@@ -7,6 +7,15 @@
 #include "rtp_llm/models_py/bindings/ParamsBase.h"
 #include "rtp_llm/cpp/utils/Logger.h"
 namespace torch_ext {
+struct MlaParams {
+    torch::Tensor batch_indice;
+    torch::Tensor positions;
+    torch::Tensor paged_kv_last_page_len;
+    torch::Tensor kvlen;
+    torch::Tensor page_indice;
+    torch::Tensor page_indptr;
+    torch::Tensor qo_indptr;
+};
 
 struct KVCache {
     torch::Tensor k_cache_base;
