26 changes: 26 additions & 0 deletions open_source/deps/requirements_lock_torch_gpu_cuda12.txt
@@ -587,6 +587,9 @@ filelock==3.13.1 \
flash-attn @ https://rtp-opensource.oss-cn-hangzhou.aliyuncs.com/rtp_llm/flash_attn-2.7.4.post1%2Bcu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl \
--hash=sha256:bfdb0f290cc3d21d0810ba49a360ef91090f62cdc1345ec6900447e0d12d99af
# via -r open_source/deps/requirements_torch_gpu_cuda12.txt
flashinfer-python==0.2.5 \
--hash=sha256:990aa090ef781783e76b836696ece4efd23956f72b5696d622fc619a61162aef
# via -r open_source/deps/requirements_torch_gpu_cuda12.txt
fonttools==4.53.1 \
--hash=sha256:02569e9a810f9d11f4ae82c391ebc6fb5730d95a0657d24d754ed7763fb2d122 \
--hash=sha256:0679a30b59d74b6242909945429dbddb08496935b82f91ea9bf6ad240ec23397 \
@@ -1549,6 +1552,27 @@ networkx==3.3 \
--hash=sha256:0c127d8b2f4865f59ae9cb8aafcd60b5c70f3241ebd66f7defad7c4ab90126c9 \
--hash=sha256:28575580c6ebdaf4505b22c6256a2b9de86b316dc63ba9e93abde3d78dfdbcf2
# via torch
ninja==1.13.0 \
--hash=sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f \
--hash=sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988 \
--hash=sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9 \
--hash=sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630 \
--hash=sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db \
--hash=sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978 \
--hash=sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1 \
--hash=sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72 \
--hash=sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e \
--hash=sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2 \
--hash=sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9 \
--hash=sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714 \
--hash=sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200 \
--hash=sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c \
--hash=sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5 \
--hash=sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96 \
--hash=sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1 \
--hash=sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa \
--hash=sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e
# via flashinfer-python
numba==0.60.0 \
--hash=sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74 \
--hash=sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b \
@@ -1609,6 +1633,7 @@ numpy==1.24.1 \
# contourpy
# datasets
# decord
# flashinfer-python
# gekko
# librosa
# matplotlib
@@ -3172,6 +3197,7 @@ torch @ https://mirrors.aliyun.com/pytorch-wheels/cu126/torch-2.6.0%2Bcu126-cp31
# autoawq-kernels
# bitsandbytes
# flash-attn
# flashinfer-python
# peft
# sentence-transformers
# timm
3 changes: 2 additions & 1 deletion open_source/deps/requirements_torch_gpu_cuda12.txt
@@ -7,4 +7,5 @@ https://mirrors.aliyun.com/pytorch-wheels/cu126/torchvision-0.21.0%2Bcu126-cp310
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
tensorrt==10.3.0
tensorrt-cu12-bindings==10.3.0
tensorrt-cu12-libs==10.3.0
tensorrt-cu12-libs==10.3.0
flashinfer-python==0.2.5
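A quick sanity check that the new pin resolves inside the CUDA12 environment (a minimal sketch; it assumes flashinfer exposes __version__, which recent flashinfer-python releases do):

# Illustrative check, not part of the diff: run after installing the
# requirements above in the CUDA12 environment.
import flashinfer

assert flashinfer.__version__ == "0.2.5", flashinfer.__version__
print("flashinfer-python OK:", flashinfer.__version__)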
9 changes: 7 additions & 2 deletions rtp_llm/BUILD
@@ -22,6 +22,10 @@ tensorrt = [
"tensorrt-cu12-libs",
]

flashinfer = [
"flashinfer-python",
]

xft_dep = select({
"@//:using_arm": [],
"//:xft_use_icx": [
@@ -98,7 +102,7 @@ requirement([
"concurrent_log_handler",
"aiter",
"fastsafetensors",
] + tensorrt)
] + tensorrt + flashinfer)

filegroup(
name = "cutlass_config",
@@ -210,7 +214,7 @@ py_library(
"//rtp_llm/model_loader:loader",
"//rtp_llm/models_py:models",
] + arch_dep + select({
"@//:using_cuda12": tensorrt,
"@//:using_cuda12": tensorrt + flashinfer,
"//conditions:default": []
}) + select({
"@//:using_arm": [],
@@ -530,6 +534,7 @@ whl_reqs = [
"bitsandbytes",
"portalocker",
"concurrent_log_handler",
"flashinfer-python==0.2.5",
] + whl_deps() + platform_deps() + xft_dep

py_wheel(
1 change: 1 addition & 0 deletions rtp_llm/cpp/pybind/BUILD
@@ -47,6 +47,7 @@ cc_library(
deps = [
"//rtp_llm/cpp/utils:core_utils",
"//rtp_llm/cpp/devices:devices_base",
"//rtp_llm/cpp/devices:device_utils",
"@havenask//aios/autil:base64",
"@havenask//aios/autil:zlib",
] + torch_deps() + select_py_bindings(),
8 changes: 7 additions & 1 deletion rtp_llm/models/deepseek_v2.py
@@ -27,6 +27,8 @@
from rtp_llm.models.rotary_embedding.deepseek_rotary_embedding import (
DeepseekV3YarnRotaryEmbedding,
)
from rtp_llm.models_py.model_desc.deepseek_v2 import DeepSeekV2Model
from rtp_llm.models_py.model_desc.module_base import GptModelBase
from rtp_llm.utils.model_weight import (
CkptWeightInfo,
W,
@@ -516,10 +518,14 @@ def _create_config(cls, ckpt_path: str):
norm_type="rmsnorm",
has_post_decoder_layernorm=True,
)
config.activation_type = "gated-silu"
# config.activation_type = "gated-silu"
config.activation_type = "SiGLU"
DeepSeekV2._from_hf(config, ckpt_path)
return config

def _create_python_model(self) -> Optional[GptModelBase]:
self.py_model = DeepSeekV2Model(self.config, self.weight)
return self.py_model

@staticmethod
def _from_hf(config: GptInitModelParameters, ckpt_path: str):
config_path = os.path.join(ckpt_path, "config.json")
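For context, a minimal sketch of the contract this new hook follows (the call site below is an assumption for illustration; only DeepSeekV2Model and GptModelBase come from this change):

# Hypothetical call site: a runner asks the model wrapper for a
# python-side implementation and falls back to the C++ path on None.
py_model = model._create_python_model()  # sets and returns self.py_model
if py_model is not None:
    assert isinstance(py_model, GptModelBase)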
9 changes: 8 additions & 1 deletion rtp_llm/models_py/BUILD
@@ -33,6 +33,10 @@ py_library(
visibility = ["//visibility:public"],
)

# flashinfer-python is only available for CUDA12
flashinfer = ["flashinfer-python"]
requirement(flashinfer)

py_library(
name = "modules",
srcs = glob([
@@ -43,7 +47,10 @@
":utils",
":kernels",
":distributed",
],
] + select({
"@//:using_cuda12": flashinfer,
"//conditions:default": [],
}),
visibility = ["//visibility:public"],
)

1 change: 1 addition & 0 deletions rtp_llm/models_py/bindings/BUILD
@@ -7,6 +7,7 @@ cc_library(
"OpDefs.h",
"OpDefsUtils.h",
"ParamsBase.h",
"MlaParamsBase.h",
],
srcs = [
"OpDefs.cc",
18 changes: 18 additions & 0 deletions rtp_llm/models_py/bindings/MlaParamsBase.h
@@ -0,0 +1,18 @@
#pragma once
#include <memory>
#include <torch/extension.h>
#include "rtp_llm/models_py/bindings/OpDefs.h"

namespace rtp_llm {

class MlaParamsBase {
public:
virtual ~MlaParamsBase() = default;
virtual torch_ext::MlaParams fillParams(torch::Tensor t_prefix_lengths,
torch::Tensor t_sequence_lengths,
torch::Tensor t_input_lengths,
torch::Tensor t_kv_cache_block_id_host,
int seq_size_per_block) = 0;
};

} // namespace rtp_llm
22 changes: 16 additions & 6 deletions rtp_llm/models_py/bindings/OpDefs.cc
@@ -3,9 +3,19 @@
namespace torch_ext {

void registerPyOpDefs(pybind11::module& m) {
pybind11::class_<MlaParams>(m, "MlaParams")
.def(pybind11::init<>())
.def_readonly("batch_indice", &MlaParams::batch_indice)
.def_readonly("positions", &MlaParams::positions)
.def_readonly("paged_kv_last_page_len", &MlaParams::paged_kv_last_page_len)
.def_readonly("kvlen", &MlaParams::kvlen)
.def_readonly("page_indice", &MlaParams::page_indice)
.def_readonly("page_indptr", &MlaParams::page_indptr)
.def_readonly("qo_indptr", &MlaParams::qo_indptr);

pybind11::class_<KVCache>(m, "KVCache")
.def(pybind11::init<>())
.def_readonly("k_cache_base", &KVCache::k_cache_base, "Key cache base tensor")
.def_readwrite("k_cache_base", &KVCache::k_cache_base, "Key cache base tensor")
.def_readonly("v_cache_base", &KVCache::v_cache_base, "Value cache base tensor")
.def_readonly("k_scale_base", &KVCache::k_scale_base, "Key cache scale tensor")
.def_readonly("v_scale_base", &KVCache::v_scale_base, "Value cache scale tensor")
@@ -43,12 +53,12 @@ void registerPyOpDefs(pybind11::module& m) {

pybind11::class_<PyAttentionInputs>(m, "PyAttentionInputs")
.def(pybind11::init<>())
.def_readonly("is_prefill", &PyAttentionInputs::is_prefill)
.def_readonly("prefix_lengths", &PyAttentionInputs::prefix_lengths)
.def_readonly("sequence_lengths", &PyAttentionInputs::sequence_lengths)
.def_readonly("input_lengths", &PyAttentionInputs::input_lengths)
.def_readwrite("is_prefill", &PyAttentionInputs::is_prefill)
.def_readwrite("prefix_lengths", &PyAttentionInputs::prefix_lengths)
.def_readwrite("sequence_lengths", &PyAttentionInputs::sequence_lengths)
.def_readwrite("input_lengths", &PyAttentionInputs::input_lengths)
.def_readonly("cu_seqlens", &PyAttentionInputs::cu_seqlens)
.def_readonly("kv_cache_block_id_host", &PyAttentionInputs::kv_cache_block_id_host)
.def_readwrite("kv_cache_block_id_host", &PyAttentionInputs::kv_cache_block_id_host)
.def_readonly("kv_cache_block_id_device", &PyAttentionInputs::kv_cache_block_id_device)
.def_readonly("dtype", &PyAttentionInputs::dtype)
.def_readonly("kv_block_offset", &PyAttentionInputs::kv_block_offset)
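The readonly-to-readwrite switches above let Python code populate attention inputs before dispatch instead of treating them as output-only views. A minimal sketch of what that enables (the bindings import path below is an assumption; the actual module name depends on the build):

# Sketch only: "libth_transformer" as the module name is hypothetical.
import torch
from libth_transformer import PyAttentionInputs

attn = PyAttentionInputs()
attn.is_prefill = False  # now assignable from Python
attn.sequence_lengths = torch.tensor([5, 2], dtype=torch.int32)
attn.input_lengths = torch.tensor([1, 1], dtype=torch.int32)
attn.kv_cache_block_id_host = torch.tensor([[0, 1], [2, 3]], dtype=torch.int32)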
9 changes: 9 additions & 0 deletions rtp_llm/models_py/bindings/OpDefs.h
@@ -7,6 +7,15 @@
#include "rtp_llm/models_py/bindings/ParamsBase.h"
#include "rtp_llm/cpp/utils/Logger.h"
namespace torch_ext {
struct MlaParams {
torch::Tensor batch_indice;
torch::Tensor positions;
torch::Tensor paged_kv_last_page_len;
torch::Tensor kvlen;
torch::Tensor page_indice;
torch::Tensor page_indptr;
torch::Tensor qo_indptr;
};

struct KVCache {
torch::Tensor k_cache_base;
1 change: 1 addition & 0 deletions rtp_llm/models_py/bindings/common/Torch_ext.h
@@ -23,6 +23,7 @@ using StreamType = hipStream_t;
#include <cuda_fp8.h>
#include "rtp_llm/cpp/kernels/layernorm_kernels.h"
#include "rtp_llm/cpp/kernels/fused_qk_rmsnorm.h"
#include "rtp_llm/cpp/kernels/rmsnormKernels.h"
using bf16_type = nv_bfloat16;
using StreamType = cudaStream_t;
#define GET_CURRENT_STREAM() at::cuda::getCurrentCUDAStream(at::cuda::current_device()).stream()
121 changes: 121 additions & 0 deletions rtp_llm/models_py/bindings/cuda/FlashInferMlaParams.cc
@@ -0,0 +1,121 @@
#include "rtp_llm/models_py/bindings/cuda/FlashInferMlaParams.h"
#include "rtp_llm/cpp/utils/AssertUtils.h"
#include "rtp_llm/cpp/core/torch_utils/BufferTorchUtils.h"
#include <algorithm>
#include <cstdint>
using namespace torch_ext;

namespace rtp_llm {

MlaParams FlashInferMlaAttnParams::fillParams(torch::Tensor t_prefix_lengths,
torch::Tensor t_sequence_lengths,
torch::Tensor t_input_lengths,
torch::Tensor t_kv_cache_block_id_host,
int seq_size_per_block) {
MlaParams params;
auto sequence_lengths_host = torchTensor2Buffer(t_sequence_lengths);
auto input_lengths_host = torchTensor2Buffer(t_input_lengths);

BufferPtr kv_cache_block_id_host;
if (t_kv_cache_block_id_host.size(0)) {
kv_cache_block_id_host = torchTensor2Buffer(t_kv_cache_block_id_host);
}

BufferPtr prefix_lengths_host;
if (t_prefix_lengths.size(0)) {
prefix_lengths_host = torchTensor2Buffer(t_prefix_lengths);
}

const int max_batch_blocks = kv_cache_block_id_host ? kv_cache_block_id_host->shape()[1] : -1;
const int batch_size = input_lengths_host->shape()[0];

int max_kv_len = 0;
int max_q_len = 0;
int accu_q_len = 0;
int offset = 0;
int total_page_idx = 0;

std::vector<int32_t> batch_indice;
std::vector<int32_t> positions;
std::vector<int32_t> paged_kv_last_page_len;
std::vector<int32_t> kvlen;
std::vector<int32_t> page_indice;
std::vector<int32_t> page_indptr = {0};
std::vector<int32_t> qo_indptr = {0};

auto input_lengths = input_lengths_host->data<int>();
auto prefix_lengths = prefix_lengths_host ? prefix_lengths_host->data<int>() : nullptr;
auto sequence_lengths = sequence_lengths_host ? sequence_lengths_host->data<int>() : nullptr;
auto kv_cache_block_id = kv_cache_block_id_host ? kv_cache_block_id_host->data<int>() : nullptr;

for (int i = 0; i < batch_size; i++) {
int seq_len = 0;
if (prefix_lengths) {
int input_length = input_lengths[i];
int prefix_length = prefix_lengths[i];

for (int j = 0; j < input_length; j++) {
batch_indice.push_back(i);
positions.push_back(j + prefix_length);
offset += 1;
}
seq_len = input_length + prefix_length;
max_q_len = std::max(max_q_len, input_length);
accu_q_len += input_length;
} else {
batch_indice.push_back(i);
positions.push_back(sequence_lengths[i]);
seq_len = sequence_lengths[i] + 1;
accu_q_len += 1;
}
paged_kv_last_page_len.push_back((seq_len - 1) % seq_size_per_block + 1);
kvlen.push_back(seq_len);
max_kv_len = std::max(seq_len, max_kv_len);

int page_num = (seq_len + seq_size_per_block - 1) / seq_size_per_block;

if (kv_cache_block_id) {
for (int j = 0; j < page_num; j++) {
auto page_idx = kv_cache_block_id[i * max_batch_blocks + j];
page_indice.push_back(page_idx);
total_page_idx++;
}
}
if (prefix_lengths) {
page_indptr.push_back(seq_len);
} else {
page_indptr.push_back(total_page_idx);
}
// page_indptr.push_back(total_page_idx);
qo_indptr.push_back(accu_q_len);
}
auto cuda_option = torch::dtype(torch::kInt).device(torch::DeviceType::CUDA).requires_grad(false);
params.batch_indice = torch::tensor(batch_indice, cuda_option);
params.page_indice = torch::tensor(page_indice, cuda_option);
params.page_indptr = torch::tensor(page_indptr, cuda_option);
params.paged_kv_last_page_len = torch::tensor(paged_kv_last_page_len, cuda_option);
params.qo_indptr = torch::tensor(qo_indptr, cuda_option);
params.kvlen = torch::tensor(kvlen, cuda_option);
params.positions = torch::tensor(positions, cuda_option);
return params;
}

void registerPyFlashInferMlaParams(pybind11::module& m) {
m.def(
"fill_mla_params",
[](torch::Tensor t_prefill_lengths,
torch::Tensor t_sequence_lengths,
torch::Tensor t_input_lengths,
torch::Tensor t_kv_cache_block_id_host,
int seq_size_per_block) {
auto params = std::make_shared<rtp_llm::FlashInferMlaAttnParams>();
return params->fillParams(
t_prefill_lengths, t_sequence_lengths, t_input_lengths, t_kv_cache_block_id_host, seq_size_per_block);
},
pybind11::arg("t_prefill_lengths"),
pybind11::arg("t_sequence_lengths"),
pybind11::arg("t_input_lengths"),
pybind11::arg("t_kv_cache_block_id_host"),
pybind11::arg("seq_size_per_block"));
}

} // namespace rtp_llm
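To make the index layout concrete, here is a worked decode-path example (empty prefix_lengths) for two sequences with seq_size_per_block = 4; it mirrors the loop above in plain Python and is illustrative only:

# Decode step: sequence_lengths = [5, 2] tokens already cached; the block
# table rows play the role of kv_cache_block_id_host.
seq_size_per_block = 4
sequence_lengths = [5, 2]
block_table = [[0, 1], [2, 3]]

batch_indice, positions, last_page_len, kvlen = [], [], [], []
page_indice, page_indptr, qo_indptr = [], [0], [0]
total_pages, accu_q = 0, 0
for i, cached in enumerate(sequence_lengths):
    batch_indice.append(i)
    positions.append(cached)              # position of the new token
    seq_len = cached + 1                  # cache length after this step
    accu_q += 1                           # one query token per decode step
    last_page_len.append((seq_len - 1) % seq_size_per_block + 1)
    kvlen.append(seq_len)
    pages = (seq_len + seq_size_per_block - 1) // seq_size_per_block
    page_indice += block_table[i][:pages]
    total_pages += pages
    page_indptr.append(total_pages)
    qo_indptr.append(accu_q)

# batch_indice  -> [0, 1]      positions     -> [5, 2]
# kvlen         -> [6, 3]      last_page_len -> [2, 3]
# page_indice   -> [0, 1, 2]   page_indptr   -> [0, 2, 3]
# qo_indptr     -> [0, 1, 2]
# i.e. the CSR-style paged layout FlashInfer's MLA kernels consume.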
23 changes: 23 additions & 0 deletions rtp_llm/models_py/bindings/cuda/FlashInferMlaParams.h
@@ -0,0 +1,23 @@
#pragma once

#include <torch/extension.h>
#include <vector>
#include <memory>
#include "rtp_llm/models_py/bindings/OpDefs.h"
#include "rtp_llm/models_py/bindings/MlaParamsBase.h"

using namespace torch_ext;

namespace rtp_llm {

class FlashInferMlaAttnParams: public MlaParamsBase {
public:
MlaParams fillParams(torch::Tensor t_prefix_lengths,
torch::Tensor t_sequence_lengths,
torch::Tensor t_input_lengths,
torch::Tensor t_kv_cache_block_id_host,
int seq_size_per_block) override;
};
void registerPyFlashInferMlaParams(pybind11::module& m);

} // namespace rtp_llm
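Under the same caveat about the bindings import path, a minimal usage sketch of the registered fill_mla_params entry point (decode case, matching the worked example above):

import torch
# Hypothetical import path; fill_mla_params is the name registered above.
from libth_transformer import fill_mla_params

no_prefix = torch.empty(0, dtype=torch.int32)  # empty => decode branch
params = fill_mla_params(no_prefix,
                         torch.tensor([5, 2], dtype=torch.int32),  # sequence_lengths
                         torch.tensor([1, 1], dtype=torch.int32),  # input_lengths
                         torch.tensor([[0, 1], [2, 3]], dtype=torch.int32),
                         seq_size_per_block=4)
print(params.qo_indptr)  # int32 CUDA tensor: [0, 1, 2]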