2 changes: 1 addition & 1 deletion .bazelrc
@@ -126,7 +126,7 @@ build:arm --host_action_env TF_NEED_CUDA="0"
build:arm --crosstool_top=@bazel_tools//tools/cpp:toolchain
build:arm --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:arm --copt="-DUSING_CUDA=0"
build:arm --copt="-D_GLIBCXX_USE_CXX11_ABI=0"
build:arm --copt="-D_GLIBCXX_USE_CXX11_ABI=1"
build:arm --define=xft_use_icx=false
build:arm --copt=-Wno-tautological-compare
build:arm --copt=-Wno-array-bounds # aios
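The ABI flip above lands together with the torch 2.6 aarch64 wheel bump in open_source/deps below, which suggests the new wheel's libtorch is built with the CXX11 libstdc++ ABI; if so, every std::string crossing the libtorch boundary must use the same layout. As a minimal sketch (not part of this PR, assuming GCC/libstdc++), a translation unit like the following fails the build early if the -D_GLIBCXX_USE_CXX11_ABI=1 copt is not actually in effect:

    // Illustrative guard only; _GLIBCXX_USE_CXX11_ABI is the standard libstdc++ macro.
    #include <string>

    #if defined(__GLIBCXX__) && defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI == 0
    #error "CXX11 ABI disabled: std::string layout will not match a CXX11-ABI libtorch build"
    #endif

    int main() {
        std::string s = "cxx11 abi check";  // resolves to std::__cxx11::basic_string under the new ABI
        return s.empty() ? 1 : 0;
    }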
9 changes: 9 additions & 0 deletions open_source/deps/BUILD
@@ -53,3 +53,12 @@ compile_pip_requirements(
requirements_txt = "requirements_lock_rocm.txt",
tags = ["manual"],
)

+compile_pip_requirements(
+name = "requirements_cpu_arm",
+src = "requirements_cpu_arm.txt",
+extra_args = PIP_EXTRA_ARGS,
+extra_data = ["//open_source/deps:requirements_base.txt"],
+requirements_txt = "requirements_lock_torch_arm.txt",
+tags = ["manual"],
+)
4 changes: 2 additions & 2 deletions open_source/deps/http.bzl
@@ -82,9 +82,9 @@ def http_deps():

http_archive(
name = "torch_2.3_py310_cpu_aarch64",
sha256 = "bef6996c27d8f6e92ea4e13a772d89611da0e103b48790de78131e308cf73076",
sha256 = "90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9",
urls = [
"https://download.pytorch.org/whl/cpu/torch-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=bef6996c27d8f6e92ea4e13a772d89611da0e103b48790de78131e308cf73076"
"https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9"
],
type = "zip",
build_file = clean_dep("//:BUILD.pytorch"),
3 changes: 2 additions & 1 deletion open_source/deps/requirements_cpu_arm.txt
@@ -1,2 +1,3 @@
-https://download.pytorch.org/whl/cpu/torch-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=bef6996c27d8f6e92ea4e13a772d89611da0e103b48790de78131e308cf73076 ; platform_machine == "aarch64"
+https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9 ; platform_machine == "aarch64"
+https://download.pytorch.org/whl/cpu/torchvision-0.21.0-cp310-cp310-linux_aarch64.whl#sha256=54815e0a56dde95cc6ec952577f67e0dc151eadd928e8d9f6a7f821d69a4a734 ; platform_machine == "aarch64"
-r ../../open_source/deps/requirements_base.txt
338 changes: 161 additions & 177 deletions open_source/deps/requirements_lock_torch_arm.txt

Large diffs are not rendered by default.

22 changes: 14 additions & 8 deletions rtp_llm/cpp/cache/CacheManager.cc
@@ -208,16 +208,18 @@ void CacheManager::initKvCache() {
}

void CacheManager::initKVCacheScale() {
+bool is_cpu = (this->device_->getDeviceProperties().type == DeviceType::ArmCpu);
+rtp_llm::MemoryType memory_type = is_cpu ? rtp_llm::MemoryType::MEMORY_CPU : rtp_llm::MemoryType::MEMORY_GPU;
if (config_.dtype == rtp_llm::DataType::TYPE_INT8) {
kv_cache_.k_scale =
-std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+std::make_unique<rtp_llm::Buffer>(memory_type,
rtp_llm::DataType::TYPE_FP32,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
(size_t)config_.local_head_num_kv,
(size_t)config_.seq_size_per_block},
(int8_t*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2);
-kv_cache_.v_scale = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+kv_cache_.v_scale = std::make_unique<rtp_llm::Buffer>(memory_type,
rtp_llm::DataType::TYPE_FP32,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
@@ -230,15 +232,15 @@ void CacheManager::initKVCacheScale() {
#ifdef ENABLE_FP8
else if (config_.dtype == rtp_llm::DataType::TYPE_FP8_E4M3) {
kv_cache_.k_scale = std::make_unique<rtp_llm::Buffer>(
-rtp_llm::MemoryType::MEMORY_GPU,
+memory_type,
rtp_llm::DataType::TYPE_FP32,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
(size_t)config_.local_head_num_kv,
(size_t)config_.seq_size_per_block},
(__nv_fp8_e4m3*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2);
kv_cache_.v_scale = std::make_unique<rtp_llm::Buffer>(
-rtp_llm::MemoryType::MEMORY_GPU,
+memory_type,
rtp_llm::DataType::TYPE_FP32,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
@@ -251,14 +253,16 @@ void CacheManager::initKVCacheScale() {

void CacheManager::initKvCacheMla() {
RTP_LLM_LOG_INFO("init mla kv cache");
-kv_cache_.k_blocks = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+bool is_cpu = (this->device_->getDeviceProperties().type == DeviceType::ArmCpu);
+rtp_llm::MemoryType memory_type = is_cpu ? rtp_llm::MemoryType::MEMORY_CPU : rtp_llm::MemoryType::MEMORY_GPU;
+kv_cache_.k_blocks = std::make_unique<rtp_llm::Buffer>(memory_type,
config_.dtype,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
(size_t)config_.seq_size_per_block,
(size_t)config_.kv_lora_rank},
cache_base_ptr_);
-kv_cache_.v_blocks = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+kv_cache_.v_blocks = std::make_unique<rtp_llm::Buffer>(memory_type,
config_.dtype,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
@@ -269,15 +273,17 @@ void CacheManager::initKvCacheMla() {

void CacheManager::initKvCacheNormal() {
RTP_LLM_LOG_INFO("init normal kv cache");
-kv_cache_.k_blocks = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+bool is_cpu = (this->device_->getDeviceProperties().type == DeviceType::ArmCpu);
+rtp_llm::MemoryType memory_type = is_cpu ? rtp_llm::MemoryType::MEMORY_CPU : rtp_llm::MemoryType::MEMORY_GPU;
+kv_cache_.k_blocks = std::make_unique<rtp_llm::Buffer>(memory_type,
config_.dtype,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
(size_t)config_.local_head_num_kv,
(size_t)config_.seq_size_per_block,
(size_t)config_.size_per_head},
cache_base_ptr_);
-kv_cache_.v_blocks = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+kv_cache_.v_blocks = std::make_unique<rtp_llm::Buffer>(memory_type,
config_.dtype,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
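All three init paths above now pick the KV-cache buffer's memory type from the device — host memory on ArmCpu, device memory otherwise — while the shapes and base-pointer offsets stay unchanged. Since the same two-line check is repeated in initKVCacheScale, initKvCacheMla and initKvCacheNormal, a small helper could centralize it; the sketch below is only a suggestion using the names from the diff, not part of this change:

    // Hypothetical refactoring sketch, not in this PR.
    rtp_llm::MemoryType CacheManager::kvCacheMemoryType() const {
        const bool is_cpu = (device_->getDeviceProperties().type == DeviceType::ArmCpu);
        return is_cpu ? rtp_llm::MemoryType::MEMORY_CPU : rtp_llm::MemoryType::MEMORY_GPU;
    }

Each init function would then call kvCacheMemoryType() instead of restating the ternary.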
6 changes: 3 additions & 3 deletions rtp_llm/cpp/devices/DeviceExport.h
@@ -26,7 +26,7 @@ class DeviceExporter {
virtual torch::Tensor packInt8TensorToPackedInt4(torch::Tensor weight) = 0;
virtual torch::Tensor preprocessWeightsForMixedGemm(torch::Tensor weight, py::object quant_type, const std::string &arch) = 0;
virtual std::vector<torch::Tensor> symmetricQuantizeLastAxisOfBatchedMatrix(torch::Tensor weight, py::object quant_type, const std::string &arch) = 0;
-virtual torch::Tensor preprocessWeightScale(torch::Tensor weight, torch::Tensor scale) = 0;
+virtual torch::Tensor preprocessWeightScale(torch::Tensor weight, torch::Tensor scale, const std::string& key) = 0;

protected:
rtp_llm::DeviceInitParams device_params_;
@@ -55,8 +55,8 @@ class DeviceExporterImpl : public DeviceExporter {
const auto dtype = torch::python::detail::py_object_to_dtype(quant_type);
return Device::symmetricQuantizeLastAxisOfBatchedMatrix(weight, dtype, arch);
}
-torch::Tensor preprocessWeightScale(torch::Tensor weight, torch::Tensor scale) {
-return Device::preprocessWeightScale(weight, scale);
+torch::Tensor preprocessWeightScale(torch::Tensor weight, torch::Tensor scale, const std::string& key) {
+return Device::preprocessWeightScale(weight, scale, key);
}
};

2 changes: 1 addition & 1 deletion rtp_llm/cpp/devices/DeviceFactory.cc
@@ -216,7 +216,7 @@ void registerDeviceOps(py::module& m) {
py::arg("weight"),
py::arg("quant_type"),
py::arg("arch"))
.def("preprocess_weight_scale", &DeviceExporter::preprocessWeightScale, py::arg("weight"), py::arg("scale"));
.def("preprocess_weight_scale", &DeviceExporter::preprocessWeightScale, py::arg("weight"), py::arg("scale"), py::arg("key"));

m.def("get_device", &DeviceFactory::getDeviceExporter);
}
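Together with the DeviceExport.h change above, the binding now threads a per-tensor key string from the Python weight loader down to Device::preprocessWeightScale. The diff does not show how a concrete device consumes the key; the sketch below is a hypothetical device-side implementation (the signature matches the diff, but the body, the key convention and the helper names are assumptions for illustration):

    // Hypothetical implementation in a concrete device; packFfnScale/packAttentionScale are made-up helpers.
    torch::Tensor ArmCpuDevice::preprocessWeightScale(torch::Tensor weight, torch::Tensor scale, const std::string& key) {
        if (key.find("ffn") != std::string::npos) {
            return packFfnScale(weight, scale);       // assumed FFN-specific packing
        }
        return packAttentionScale(weight, scale);     // assumed default packing
    }

From Python, the extra argument simply becomes device.preprocess_weight_scale(weight, scale, key), following the py::arg names registered above.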
1 change: 1 addition & 0 deletions rtp_llm/cpp/devices/OpData.h
@@ -526,6 +526,7 @@ struct MlaRotaryWriteKVCacheParams {
const AttentionLayerWeights& weights;
const AttentionConfigs& configs;
const QScheme qscheme;
+bool is_prefill = false;
};

struct MlaAttentionModuleParams {
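The new flag defaults to false, so existing brace-initialized MlaRotaryWriteKVCacheParams call sites that never mention it keep compiling with is_prefill left false. A self-contained toy illustrating that property (the struct below is a placeholder, not the real type):

    #include <cassert>

    // Toy stand-in: a trailing member with a default initializer may be omitted
    // at aggregate-initialization call sites (C++14 and later).
    struct ParamsSketch {
        int qscheme;              // placeholder for the existing members
        bool is_prefill = false;  // new trailing flag
    };

    int main() {
        ParamsSketch old_style{1};                        // pre-existing init style still compiles
        ParamsSketch prefill{1, /*is_prefill=*/true};     // new call sites opt in explicitly
        assert(!old_style.is_prefill && prefill.is_prefill);
        return 0;
    }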
20 changes: 18 additions & 2 deletions rtp_llm/cpp/devices/arm_impl/ArmActOp.cc
@@ -127,14 +127,30 @@ BufferPtr ArmCpuDevice::activation(const ActivationParams& params) {
gate = params.gate.value().get().data();
printBufferData(params.gate.value().get(), "ffn activation gate");
if (states->type() == DataType::TYPE_FP16) {
+#pragma omp parallel for if (m > 1)
for (size_t i = 0; i < m; i++) {
-for (size_t j = 0; j < n; j++) {
+size_t j;
+for (j = 0; j <= n - 8; j += 8) {
+float16x8_t gate_vec = vld1q_f16((__fp16*)gate + i * n + j);
+float16x8_t state_vec = vld1q_f16((__fp16*)states->dataWithOffset(i * n + j));
+state_vec = vmulq_f16(state_vec, gate_vec);
+vst1q_f16((__fp16*)states->dataWithOffset(i * n + j), state_vec);
+}
+for (; j < n; j++) {
*(__fp16*)(states->dataWithOffset(i * n + j)) *= ((__fp16*)gate)[i * n + j];
}
}
} else if (states->type() == DataType::TYPE_FP32) {
+#pragma omp parallel for if (m > 1)
for (size_t i = 0; i < m; i++) {
-for (size_t j = 0; j < n; j++) {
+size_t j;
+for (j = 0; j <= n - 4; j += 4) {
+float32x4_t gate_vec = vld1q_f32((float*)gate + i * n + j);
+float32x4_t state_vec = vld1q_f32((float*)states->dataWithOffset(i * n + j));
+state_vec = vmulq_f32(state_vec, gate_vec);
+vst1q_f32((float*)states->dataWithOffset(i * n + j), state_vec);
+}
+for (; j < n; j++) {
*(float*)(states->dataWithOffset(i * n + j)) *= ((float*)gate)[i * n + j];
}
}
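The activation diff above replaces the scalar gate multiply with NEON intrinsics — eight fp16 lanes or four fp32 lanes per iteration plus a scalar tail — and parallelizes the row loop with OpenMP. One detail worth noting: with an unsigned index, n - 8 (or n - 4) wraps around when n is smaller than the vector width, which is unlikely for the activation widths this path sees, but the standalone sketch below guards it anyway. It is an illustrative fp32-only version on raw pointers rather than the repo's Buffer API:

    #include <arm_neon.h>
    #include <cstddef>

    // Gate-multiply pattern from the diff, fp32 only: 4 NEON lanes per step,
    // scalar tail for the remainder, rows parallelized when m > 1.
    static void mul_gate_f32(float* states, const float* gate, size_t m, size_t n) {
    #pragma omp parallel for if (m > 1)
        for (size_t i = 0; i < m; ++i) {
            float*       s = states + i * n;
            const float* g = gate + i * n;
            size_t       j = 0;
            if (n >= 4) {                          // avoid unsigned wrap-around of n - 4
                for (; j <= n - 4; j += 4) {
                    float32x4_t sv = vld1q_f32(s + j);
                    float32x4_t gv = vld1q_f32(g + j);
                    vst1q_f32(s + j, vmulq_f32(sv, gv));
                }
            }
            for (; j < n; ++j) {
                s[j] *= g[j];                      // same as the original scalar inner loop
            }
        }
    }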