2 changes: 1 addition & 1 deletion .bazelrc
@@ -126,7 +126,7 @@ build:arm --host_action_env TF_NEED_CUDA="0"
build:arm --crosstool_top=@bazel_tools//tools/cpp:toolchain
build:arm --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:arm --copt="-DUSING_CUDA=0"
build:arm --copt="-D_GLIBCXX_USE_CXX11_ABI=0"
build:arm --copt="-D_GLIBCXX_USE_CXX11_ABI=1"
build:arm --define=xft_use_icx=false
build:arm --copt=-Wno-tautological-compare
build:arm --copt=-Wno-array-bounds # aios
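The ABI flip above lands together with the torch 2.6 aarch64 wheel bump in open_source/deps below, which suggests the new wheel's libtorch is built with the CXX11 libstdc++ ABI; if so, every std::string crossing the libtorch boundary must use the same layout. As a minimal sketch (not part of this PR, assuming GCC/libstdc++), a translation unit like the following fails the build early if the -D_GLIBCXX_USE_CXX11_ABI=1 copt is not actually in effect:

    // Illustrative guard only; _GLIBCXX_USE_CXX11_ABI is the standard libstdc++ macro.
    #include <string>

    #if defined(__GLIBCXX__) && defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI == 0
    #error "CXX11 ABI disabled: std::string layout will not match a CXX11-ABI libtorch build"
    #endif

    int main() {
        std::string s = "cxx11 abi check";  // resolves to std::__cxx11::basic_string under the new ABI
        return s.empty() ? 1 : 0;
    }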
9 changes: 9 additions & 0 deletions open_source/deps/BUILD
@@ -53,3 +53,12 @@ compile_pip_requirements(
requirements_txt = "requirements_lock_rocm.txt",
tags = ["manual"],
)

+compile_pip_requirements(
+name = "requirements_cpu_arm",
+src = "requirements_cpu_arm.txt",
+extra_args = PIP_EXTRA_ARGS,
+extra_data = ["//open_source/deps:requirements_base.txt"],
+requirements_txt = "requirements_lock_torch_arm.txt",
+tags = ["manual"],
+)
4 changes: 2 additions & 2 deletions open_source/deps/http.bzl
@@ -82,9 +82,9 @@ def http_deps():

http_archive(
name = "torch_2.3_py310_cpu_aarch64",
sha256 = "bef6996c27d8f6e92ea4e13a772d89611da0e103b48790de78131e308cf73076",
sha256 = "90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9",
urls = [
"https://download.pytorch.org/whl/cpu/torch-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=bef6996c27d8f6e92ea4e13a772d89611da0e103b48790de78131e308cf73076"
"https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9"
],
type = "zip",
build_file = clean_dep("//:BUILD.pytorch"),
3 changes: 2 additions & 1 deletion open_source/deps/requirements_cpu_arm.txt
@@ -1,2 +1,3 @@
-https://download.pytorch.org/whl/cpu/torch-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=bef6996c27d8f6e92ea4e13a772d89611da0e103b48790de78131e308cf73076 ; platform_machine == "aarch64"
+https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9 ; platform_machine == "aarch64"
+https://download.pytorch.org/whl/cpu/torchvision-0.21.0-cp310-cp310-linux_aarch64.whl#sha256=54815e0a56dde95cc6ec952577f67e0dc151eadd928e8d9f6a7f821d69a4a734 ; platform_machine == "aarch64"
-r ../../open_source/deps/requirements_base.txt
338 changes: 161 additions & 177 deletions open_source/deps/requirements_lock_torch_arm.txt

Large diffs are not rendered by default.

22 changes: 14 additions & 8 deletions rtp_llm/cpp/cache/CacheManager.cc
@@ -208,16 +208,18 @@ void CacheManager::initKvCache() {
}

void CacheManager::initKVCacheScale() {
+bool is_cpu = (this->device_->getDeviceProperties().type == DeviceType::ArmCpu);
+rtp_llm::MemoryType memory_type = is_cpu ? rtp_llm::MemoryType::MEMORY_CPU : rtp_llm::MemoryType::MEMORY_GPU;
if (config_.dtype == rtp_llm::DataType::TYPE_INT8) {
kv_cache_.k_scale =
-std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+std::make_unique<rtp_llm::Buffer>(memory_type,
rtp_llm::DataType::TYPE_FP32,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
(size_t)config_.local_head_num_kv,
(size_t)config_.seq_size_per_block},
(int8_t*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2);
-kv_cache_.v_scale = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+kv_cache_.v_scale = std::make_unique<rtp_llm::Buffer>(memory_type,
rtp_llm::DataType::TYPE_FP32,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
@@ -230,15 +232,15 @@ void CacheManager::initKVCacheScale() {
#ifdef ENABLE_FP8
else if (config_.dtype == rtp_llm::DataType::TYPE_FP8_E4M3) {
kv_cache_.k_scale = std::make_unique<rtp_llm::Buffer>(
-rtp_llm::MemoryType::MEMORY_GPU,
+memory_type,
rtp_llm::DataType::TYPE_FP32,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
(size_t)config_.local_head_num_kv,
(size_t)config_.seq_size_per_block},
(__nv_fp8_e4m3*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2);
kv_cache_.v_scale = std::make_unique<rtp_llm::Buffer>(
-rtp_llm::MemoryType::MEMORY_GPU,
+memory_type,
rtp_llm::DataType::TYPE_FP32,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
@@ -251,14 +253,16 @@ void CacheManager::initKVCacheScale() {

void CacheManager::initKvCacheMla() {
RTP_LLM_LOG_INFO("init mla kv cache");
-kv_cache_.k_blocks = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+bool is_cpu = (this->device_->getDeviceProperties().type == DeviceType::ArmCpu);
+rtp_llm::MemoryType memory_type = is_cpu ? rtp_llm::MemoryType::MEMORY_CPU : rtp_llm::MemoryType::MEMORY_GPU;
+kv_cache_.k_blocks = std::make_unique<rtp_llm::Buffer>(memory_type,
config_.dtype,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
(size_t)config_.seq_size_per_block,
(size_t)config_.kv_lora_rank},
cache_base_ptr_);
-kv_cache_.v_blocks = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+kv_cache_.v_blocks = std::make_unique<rtp_llm::Buffer>(memory_type,
config_.dtype,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
@@ -269,15 +273,17 @@ void CacheManager::initKvCacheMla() {

void CacheManager::initKvCacheNormal() {
RTP_LLM_LOG_INFO("init normal kv cache");
-kv_cache_.k_blocks = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+bool is_cpu = (this->device_->getDeviceProperties().type == DeviceType::ArmCpu);
+rtp_llm::MemoryType memory_type = is_cpu ? rtp_llm::MemoryType::MEMORY_CPU : rtp_llm::MemoryType::MEMORY_GPU;
+kv_cache_.k_blocks = std::make_unique<rtp_llm::Buffer>(memory_type,
config_.dtype,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
(size_t)config_.local_head_num_kv,
(size_t)config_.seq_size_per_block,
(size_t)config_.size_per_head},
cache_base_ptr_);
-kv_cache_.v_blocks = std::make_unique<rtp_llm::Buffer>(rtp_llm::MemoryType::MEMORY_GPU,
+kv_cache_.v_blocks = std::make_unique<rtp_llm::Buffer>(memory_type,
config_.dtype,
std::vector<size_t>{(size_t)config_.layer_num,
(size_t)config_.block_nums,
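All three init paths above now pick the KV-cache buffer's memory type from the device — host memory on ArmCpu, device memory otherwise — while the shapes and base-pointer offsets stay unchanged. Since the same two-line check is repeated in initKVCacheScale, initKvCacheMla and initKvCacheNormal, a small helper could centralize it; the sketch below is only a suggestion using the names from the diff, not part of this change:

    // Hypothetical refactoring sketch, not in this PR.
    rtp_llm::MemoryType CacheManager::kvCacheMemoryType() const {
        const bool is_cpu = (device_->getDeviceProperties().type == DeviceType::ArmCpu);
        return is_cpu ? rtp_llm::MemoryType::MEMORY_CPU : rtp_llm::MemoryType::MEMORY_GPU;
    }

Each init function would then call kvCacheMemoryType() instead of restating the ternary.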
6 changes: 3 additions & 3 deletions rtp_llm/cpp/devices/DeviceExport.h
@@ -26,7 +26,7 @@ class DeviceExporter {
virtual torch::Tensor packInt8TensorToPackedInt4(torch::Tensor weight) = 0;
virtual torch::Tensor preprocessWeightsForMixedGemm(torch::Tensor weight, py::object quant_type, const std::string &arch) = 0;
virtual std::vector<torch::Tensor> symmetricQuantizeLastAxisOfBatchedMatrix(torch::Tensor weight, py::object quant_type, const std::string &arch) = 0;
-virtual torch::Tensor preprocessWeightScale(torch::Tensor weight, torch::Tensor scale) = 0;
+virtual torch::Tensor preprocessWeightScale(torch::Tensor weight, torch::Tensor scale, const std::string& key) = 0;

protected:
rtp_llm::DeviceInitParams device_params_;
@@ -55,8 +55,8 @@ class DeviceExporterImpl : public DeviceExporter {
const auto dtype = torch::python::detail::py_object_to_dtype(quant_type);
return Device::symmetricQuantizeLastAxisOfBatchedMatrix(weight, dtype, arch);
}
-torch::Tensor preprocessWeightScale(torch::Tensor weight, torch::Tensor scale) {
-return Device::preprocessWeightScale(weight, scale);
+torch::Tensor preprocessWeightScale(torch::Tensor weight, torch::Tensor scale, const std::string& key) {
+return Device::preprocessWeightScale(weight, scale, key);
}
};

2 changes: 1 addition & 1 deletion rtp_llm/cpp/devices/DeviceFactory.cc
@@ -216,7 +216,7 @@ void registerDeviceOps(py::module& m) {
py::arg("weight"),
py::arg("quant_type"),
py::arg("arch"))
.def("preprocess_weight_scale", &DeviceExporter::preprocessWeightScale, py::arg("weight"), py::arg("scale"));
.def("preprocess_weight_scale", &DeviceExporter::preprocessWeightScale, py::arg("weight"), py::arg("scale"), py::arg("key"));

m.def("get_device", &DeviceFactory::getDeviceExporter);
}
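Together with the DeviceExport.h change above, the binding now threads a per-tensor key string from the Python weight loader down to Device::preprocessWeightScale. The diff does not show how a concrete device consumes the key; the sketch below is a hypothetical device-side implementation (the signature matches the diff, but the body, the key convention and the helper names are assumptions for illustration):

    // Hypothetical implementation in a concrete device; packFfnScale/packAttentionScale are made-up helpers.
    torch::Tensor ArmCpuDevice::preprocessWeightScale(torch::Tensor weight, torch::Tensor scale, const std::string& key) {
        if (key.find("ffn") != std::string::npos) {
            return packFfnScale(weight, scale);       // assumed FFN-specific packing
        }
        return packAttentionScale(weight, scale);     // assumed default packing
    }

From Python, the extra argument simply becomes device.preprocess_weight_scale(weight, scale, key), following the py::arg names registered above.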
1 change: 1 addition & 0 deletions rtp_llm/cpp/devices/OpData.h
@@ -526,6 +526,7 @@ struct MlaRotaryWriteKVCacheParams {
const AttentionLayerWeights& weights;
const AttentionConfigs& configs;
const QScheme qscheme;
+bool is_prefill = false;
};

struct MlaAttentionModuleParams {
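The new flag defaults to false, so existing brace-initialized MlaRotaryWriteKVCacheParams call sites that never mention it keep compiling with is_prefill left false. A self-contained toy illustrating that property (the struct below is a placeholder, not the real type):

    #include <cassert>

    // Toy stand-in: a trailing member with a default initializer may be omitted
    // at aggregate-initialization call sites (C++14 and later).
    struct ParamsSketch {
        int qscheme;              // placeholder for the existing members
        bool is_prefill = false;  // new trailing flag
    };

    int main() {
        ParamsSketch old_style{1};                        // pre-existing init style still compiles
        ParamsSketch prefill{1, /*is_prefill=*/true};     // new call sites opt in explicitly
        assert(!old_style.is_prefill && prefill.is_prefill);
        return 0;
    }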
20 changes: 18 additions & 2 deletions rtp_llm/cpp/devices/arm_impl/ArmActOp.cc
@@ -127,14 +127,30 @@ BufferPtr ArmCpuDevice::activation(const ActivationParams& params) {
gate = params.gate.value().get().data();
printBufferData(params.gate.value().get(), "ffn activation gate");
if (states->type() == DataType::TYPE_FP16) {
+#pragma omp parallel for if (m > 1)
for (size_t i = 0; i < m; i++) {
-for (size_t j = 0; j < n; j++) {
+size_t j;
+for (j = 0; j <= n - 8; j += 8) {
+float16x8_t gate_vec = vld1q_f16((__fp16*)gate + i * n + j);
+float16x8_t state_vec = vld1q_f16((__fp16*)states->dataWithOffset(i * n + j));
+state_vec = vmulq_f16(state_vec, gate_vec);
+vst1q_f16((__fp16*)states->dataWithOffset(i * n + j), state_vec);
+}
+for (; j < n; j++) {
*(__fp16*)(states->dataWithOffset(i * n + j)) *= ((__fp16*)gate)[i * n + j];
}
}
} else if (states->type() == DataType::TYPE_FP32) {
+#pragma omp parallel for if (m > 1)
for (size_t i = 0; i < m; i++) {
-for (size_t j = 0; j < n; j++) {
+size_t j;
+for (j = 0; j <= n - 4; j += 4) {
+float32x4_t gate_vec = vld1q_f32((float*)gate + i * n + j);
+float32x4_t state_vec = vld1q_f32((float*)states->dataWithOffset(i * n + j));
+state_vec = vmulq_f32(state_vec, gate_vec);
+vst1q_f32((float*)states->dataWithOffset(i * n + j), state_vec);
+}
+for (; j < n; j++) {
*(float*)(states->dataWithOffset(i * n + j)) *= ((float*)gate)[i * n + j];
}
}
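The activation diff above replaces the scalar gate multiply with NEON intrinsics — eight fp16 lanes or four fp32 lanes per iteration plus a scalar tail — and parallelizes the row loop with OpenMP. One detail worth noting: with an unsigned index, n - 8 (or n - 4) wraps around when n is smaller than the vector width, which is unlikely for the activation widths this path sees, but the standalone sketch below guards it anyway. It is an illustrative fp32-only version on raw pointers rather than the repo's Buffer API:

    #include <arm_neon.h>
    #include <cstddef>

    // Gate-multiply pattern from the diff, fp32 only: 4 NEON lanes per step,
    // scalar tail for the remainder, rows parallelized when m > 1.
    static void mul_gate_f32(float* states, const float* gate, size_t m, size_t n) {
    #pragma omp parallel for if (m > 1)
        for (size_t i = 0; i < m; ++i) {
            float*       s = states + i * n;
            const float* g = gate + i * n;
            size_t       j = 0;
            if (n >= 4) {                          // avoid unsigned wrap-around of n - 4
                for (; j <= n - 4; j += 4) {
                    float32x4_t sv = vld1q_f32(s + j);
                    float32x4_t gv = vld1q_f32(g + j);
                    vst1q_f32(s + j, vmulq_f32(sv, gv));
                }
            }
            for (; j < n; ++j) {
                s[j] *= g[j];                      // same as the original scalar inner loop
            }
        }
    }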