
vLLM: update vLLM XPU to 0.8.3 version #13118

Merged
20 commits merged on Apr 30, 2025
181 changes: 44 additions & 137 deletions docker/llm/serving/xpu/docker/1ccl_for_multi_arc.patch
@@ -1,7 +1,9 @@
From d345631f78a2f33ff1ddd7d9908b288eb0afaf46 Mon Sep 17 00:00:00 2001
From: Huajun Li <[email protected]>
Date: Fri, 24 May 2024 09:47:26 +0800
Subject: [PATCH 1/3] allreduce optimization with LL256 for Arc770 dGPU
From dfe1851b59df6859829b447353307b7c916ccee0 Mon Sep 17 00:00:00 2001
From: junhansh <[email protected]>
Date: Mon, 28 Apr 2025 23:33:11 +0800
Subject: [PATCH] oneccl for Arc770 V2025.0.0.6.7

allreduce optimization with LL256 for Arc770 dGPU

To enable this feature, please set env var:
export CCL_DG2_ALLREDUCE=1
@@ -12,6 +14,15 @@ Build:
3. cmake .. -GNinja -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCMAKE_CXX_FLAGS="-fsycl" -DCOMPUTE_BACKEND=dpcpp -DCMAKE_BUILD_TYPE=MinSizeRel
4. ninja
5. ls -al src/libccl*

Changes:
optimize req_workgroup calculate

Revert "optimize req_workgroup calculate" for hang issue

This reverts commit 20bfd0e0a37f93dfb8bb9c092cd5a0b35e868bfa.

fix_fdset_buffer_overflow_issue
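
Editorial note (not part of the patch): the "fix_fdset_buffer_overflow_issue" entry above refers to the select()-to-poll() rewrite of thread_func shown in the hunks below. select() records descriptors in a fixed-size fd_set bitmap of FD_SETSIZE slots (typically 1024), so FD_SET() on a descriptor numbered FD_SETSIZE or higher writes past the end of that buffer; poll() instead takes an explicit pollfd array and accepts any descriptor value. A minimal illustrative sketch of the pattern follows, with wait_readable() as a hypothetical helper name:

#include <poll.h>
#include <stdio.h>

/* Block until fd is readable; return 0 on success, -1 on error.
 * poll() is used instead of select() so a descriptor >= FD_SETSIZE
 * cannot overflow a fixed-size fd_set bitmap. */
static int wait_readable(int fd)
{
    struct pollfd pfd = { .fd = fd, .events = POLLIN, .revents = 0 };
    int ret = poll(&pfd, 1, -1);                 /* -1 timeout: wait indefinitely */
    if (ret <= 0 || !(pfd.revents & POLLIN)) {
        perror("poll");
        return -1;
    }
    return 0;                                    /* caller may now accept()/read() on fd */
}
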
---
src/CMakeLists.txt | 2 +
src/coll/coll.cpp | 30 +-
@@ -20,9 +31,9 @@ Build:
src/common/env/env.cpp | 1 +
src/common/env/env.hpp | 1 +
src/common/env/vars.hpp | 1 +
src/dg2/dg2_allreduce.cpp | 642 +++++++++++++++++++++++++++++++
src/dg2/dg2_allreduce.cpp | 640 +++++++++++++++++++++++++++++++
src/dg2/dg2_allreduce.hpp | 13 +
9 files changed, 693 insertions(+), 3 deletions(-)
9 files changed, 691 insertions(+), 3 deletions(-)
create mode 100644 src/dg2/dg2_allreduce.cpp
create mode 100644 src/dg2/dg2_allreduce.hpp

@@ -163,10 +174,10 @@ index 73dcf77..84ab518 100644
constexpr const char* CCL_MIN_CHUNK_SIZE = "CCL_MIN_CHUNK_SIZE";
diff --git a/src/dg2/dg2_allreduce.cpp b/src/dg2/dg2_allreduce.cpp
new file mode 100644
index 0000000..15ace74
index 0000000..73e114b
--- /dev/null
+++ b/src/dg2/dg2_allreduce.cpp
@@ -0,0 +1,642 @@
@@ -0,0 +1,640 @@
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/un.h>
@@ -178,7 +189,7 @@ index 0000000..15ace74
+#include <drm/drm.h>
+
+#include <mpi.h>
+
+#include <poll.h>
+#include <vector>
+#include <sstream>
+#include <iostream>
@@ -315,14 +326,17 @@ index 0000000..15ace74
+
+static void *thread_func(void *arg)
+{
+ fd_set fds;
+ int count = 0;
+ char sock_path[64];
+ int peer_buf_fd = 0;
+ int rank = *(int *)arg;
+
+ snprintf(sock_path, sizeof(sock_path), "%s-%d_%d", SOCK_PATH, rank, 0xa770);
+ int srv_fd = srv_sock(sock_path);
+ if (srv_fd < 0) {
+ perror("srv_sock failed");
+ return nullptr;
+ }
+
+ //std::cout << "-----> srv_fd of " << sock_path << " : " << srv_fd << "\n";
+
@@ -331,35 +345,30 @@ index 0000000..15ace74
+ ze_context_handle_t ze_context = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_context);
+ ze_device_handle_t ze_device = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_device);
+
+ FD_ZERO(&fds);
+ FD_SET(srv_fd, &fds);
+ struct pollfd pfd = {
+ .fd = srv_fd,
+ .events = POLL_IN,
+ .revents = 0
+ };
+ while (++count < world_size) {
+ int ret = select(srv_fd + 1, &fds, NULL, NULL, NULL);
+ switch (ret) {
+ case 1:
+ {
+ int peer_rank;
+ void *peer_buf;
+
+ int conn_fd = accept(srv_fd, NULL, 0);
+ ccl::utils::recvmsg_fd(conn_fd, &peer_buf_fd, &peer_rank, sizeof(peer_rank));
+
+ ze_ipc_mem_handle_t ipc_handle_peer_buf = get_handle_from_fd(peer_buf_fd);
+ zeMemOpenIpcHandle(ze_context, ze_device, ipc_handle_peer_buf, ZE_IPC_MEMORY_FLAG_BIAS_CACHED /* cached allocation */, &peer_buf);
+ int ret = poll(&pfd, 1, -1);
+ if (ret <= 0) {
+ std::cerr << "poll failed: " << strerror(errno) << "\n";
+ break;
+ }
+
+ peer_bufs[peer_rank] = peer_buf;
+ //printf("<------------- rank: %d, peer_bufs[%d]: %p\n", world_rank, peer_rank, peer_bufs[peer_rank]);
+ if (pfd.revents & POLL_IN) {
+ int peer_rank;
+ void *peer_buf = nullptr;
+
+ if (conn_fd > 0) close(conn_fd);
+ int conn_fd = accept(srv_fd, NULL, 0);
+ ccl::utils::recvmsg_fd(conn_fd, &peer_buf_fd, &peer_rank, sizeof(peer_rank));
+ ze_ipc_mem_handle_t ipc_handle_peer_buf = get_handle_from_fd(peer_buf_fd);
+ zeMemOpenIpcHandle(ze_context, ze_device, ipc_handle_peer_buf, ZE_IPC_MEMORY_FLAG_BIAS_CACHED, &peer_buf);
+
+ break;
+ }
+ case 0:
+ case -1:
+ std::cout << "srv_fd select() failed" << "\n";
+ break;
+ default:
+ break;
+ peer_bufs[peer_rank] = peer_buf;
+ //printf("<------------- rank: %d, peer_bufs[%d]: %p\n", world_rank, peer_rank, peer_bufs[peer_rank]);
+ if (conn_fd > 0) close(conn_fd);
+ }
+ }
+
@@ -831,105 +840,3 @@ index 0000000..0506445
--
2.34.1


From 20bfd0e0a37f93dfb8bb9c092cd5a0b35e868bfa Mon Sep 17 00:00:00 2001
From: Huajun Li <[email protected]>
Date: Fri, 7 Mar 2025 15:15:35 +0800
Subject: [PATCH 2/3] optimize req_workgroup calculate

---
src/dg2/dg2_allreduce.cpp | 25 ++-----------------------
1 file changed, 2 insertions(+), 23 deletions(-)

diff --git a/src/dg2/dg2_allreduce.cpp b/src/dg2/dg2_allreduce.cpp
index 15ace74..83270ae 100644
--- a/src/dg2/dg2_allreduce.cpp
+++ b/src/dg2/dg2_allreduce.cpp
@@ -527,30 +527,9 @@ ccl::event dg2_ll256_allreduce(const void *src, void *dst, size_t count,
auto chunk_sz = req_workitems * LS_SZ; /* LS_SZ bytes per work-item */
auto chunk_with_pattern = sg_sz * LS_SZ; /* aligned to 256B */

- /* items will be assigned to each rank */
- auto per_rank_items = (unreduced + (local_world_size * LS_SZ - 1)) / (local_world_size * LS_SZ);
- auto req_workgroups = (per_rank_items + (workgroup_available_items - 1)) / workgroup_available_items;
- auto req_subgroups = 0;
-
- if (req_workgroups >= g_sz/l_sz) {
- req_workgroups = g_sz/l_sz;
- } else {
- if (group_id == (req_workgroups - 1)) {
- req_subgroups = (per_rank_items + (sg_sz - 1)) / (sg_sz - 1);
-
- /* (req_subgroups % (l_sz/sg_sz) - 1) equals to the final subgroup id in a workgroup */
- /* Note: req_subgroups % (l_sz/sg_sz) might be 0 */
- if (((req_subgroups % (l_sz/sg_sz)) == 0) || (sg_id == (req_subgroups % (l_sz/sg_sz) - 1))) {
- if ((per_rank_items % (sg_sz - 1)) != 0) {
- /* FIXME: */
- req_workitems = per_rank_items % (sg_sz - 1);
- chunk_sz = req_workitems * LS_SZ; /* LS_SZ bytes per work-item */
- }
- }
- }
- }
+ auto work_left = unreduced - sg_id * local_world_size * chunk_sz;

- if (group_id < req_workgroups) {
+ if (work_left > 0) {
// step 1: push data to next GPU
{
offset = base + local_world_rank * chunk_sz;
--
2.34.1


From 1c58cc9ede5ca38138a270f9e5ff59bca74f51d4 Mon Sep 17 00:00:00 2001
From: Huajun Li <[email protected]>
Date: Wed, 12 Mar 2025 13:29:27 +0800
Subject: [PATCH 3/3] Revert "optimize req_workgroup calculate" for hang issue

This reverts commit 20bfd0e0a37f93dfb8bb9c092cd5a0b35e868bfa.
---
src/dg2/dg2_allreduce.cpp | 25 +++++++++++++++++++++++--
1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/dg2/dg2_allreduce.cpp b/src/dg2/dg2_allreduce.cpp
index 83270ae..15ace74 100644
--- a/src/dg2/dg2_allreduce.cpp
+++ b/src/dg2/dg2_allreduce.cpp
@@ -527,9 +527,30 @@ ccl::event dg2_ll256_allreduce(const void *src, void *dst, size_t count,
auto chunk_sz = req_workitems * LS_SZ; /* LS_SZ bytes per work-item */
auto chunk_with_pattern = sg_sz * LS_SZ; /* aligned to 256B */

- auto work_left = unreduced - sg_id * local_world_size * chunk_sz;
+ /* items will be assigned to each rank */
+ auto per_rank_items = (unreduced + (local_world_size * LS_SZ - 1)) / (local_world_size * LS_SZ);
+ auto req_workgroups = (per_rank_items + (workgroup_available_items - 1)) / workgroup_available_items;
+ auto req_subgroups = 0;
+
+ if (req_workgroups >= g_sz/l_sz) {
+ req_workgroups = g_sz/l_sz;
+ } else {
+ if (group_id == (req_workgroups - 1)) {
+ req_subgroups = (per_rank_items + (sg_sz - 1)) / (sg_sz - 1);
+
+ /* (req_subgroups % (l_sz/sg_sz) - 1) equals to the final subgroup id in a workgroup */
+ /* Note: req_subgroups % (l_sz/sg_sz) might be 0 */
+ if (((req_subgroups % (l_sz/sg_sz)) == 0) || (sg_id == (req_subgroups % (l_sz/sg_sz) - 1))) {
+ if ((per_rank_items % (sg_sz - 1)) != 0) {
+ /* FIXME: */
+ req_workitems = per_rank_items % (sg_sz - 1);
+ chunk_sz = req_workitems * LS_SZ; /* LS_SZ bytes per work-item */
+ }
+ }
+ }
+ }

- if (work_left > 0) {
+ if (group_id < req_workgroups) {
// step 1: push data to next GPU
{
offset = base + local_world_rank * chunk_sz;
--
2.34.1


28 changes: 14 additions & 14 deletions docker/llm/serving/xpu/docker/Dockerfile
@@ -54,19 +54,20 @@ RUN set -eux && \
#
# Install Intel PyTorch extension for LLM inference
pip install --pre --upgrade ipex-llm[xpu_2.6] --extra-index-url https://download.pytorch.org/whl/xpu && \
pip install intel-extension-for-pytorch==2.6.10+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
#
# Build torch-ccl
mkdir -p /build && \
cd /build && \
git clone https://github.com/intel/torch-ccl.git && \
cd torch-ccl && \
git checkout ccl_torch2.5.0+xpu && \
git checkout ccl_torch2.6.0+xpu && \
git submodule sync && \
git submodule update --init --recursive && \
# This patch will enable build torch-ccl with pytorch 2.6 environment
git apply /tmp/ccl_torch.patch && \
# git apply /tmp/ccl_torch.patch && \
USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py bdist_wheel && \
# File path: /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl
# File path: /build/torch-ccl/dist/oneccl_bind_pt-2.6.0+xpu-cp311-cp311-linux_x86_64.whl
# Build oneCCL
pip install ninja && \
cd /build/ && \
@@ -85,7 +86,7 @@ RUN set -eux && \
FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04

# Copy the built torch-ccl package from the build stage
COPY --from=build /build/torch-ccl/dist/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl /opt/
COPY --from=build /build/torch-ccl/dist/oneccl_bind_pt-2.6.0+xpu-cp311-cp311-linux_x86_64.whl /opt/
COPY --from=build /llm/ /llm/
COPY --from=build /build/oneCCL/build/src/libccl.so.1.0 /opt/intel/1ccl-wks/lib/
COPY --from=build /build/oneCCL/build/src/libccl.so.1 /opt/intel/1ccl-wks/lib/
@@ -144,9 +145,10 @@ RUN set -eux && \
# Install vllm dependencies
pip install --upgrade fastapi && \
pip install --upgrade "uvicorn[standard]" && \
pip install intel-extension-for-pytorch==2.6.10+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
#
# Install torch-ccl
pip install /opt/oneccl_bind_pt-2.5.0+xpu-cp311-cp311-linux_x86_64.whl && \
pip install /opt/oneccl_bind_pt-2.6.0+xpu-cp311-cp311-linux_x86_64.whl && \
#
apt-get update && \
apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev numactl && \
@@ -168,21 +170,19 @@ RUN set -eux && \
mkdir -p /llm && \
cd /llm && \
rm -rf /tmp/neo && \
# Install intel_extension_for_pytorch
pip install intel-extension-for-pytorch==2.6.10+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ && \
pip uninstall -y oneccl oneccl-devel && \
pip install intel-opencl-rt==2025.0.2 intel-openmp==2025.0.2 && \
#
# Install vllm
git clone -b v0.6.6.post1 https://github.com/vllm-project/vllm /llm/vllm && \
git clone -b v0.8.3 https://github.com/vllm-project/vllm /llm/vllm && \
cd /llm/vllm && \
git apply /llm/vllm_for_multi_arc.patch && \
pip install setuptools-scm && \
pip install setuptools-scm==8.2.0 setuptools==78.1.0 && \
pip install --upgrade cmake && \
VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -v /llm/vllm && \
pip install -v -r requirements/xpu.txt && \
VLLM_TARGET_DEVICE=xpu python setup.py install && \
pip install intel-extension-for-pytorch==2.6.10+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
pip uninstall -y oneccl oneccl-devel && \
rm -rf /llm/vllm_for_multi_arc.patch && \
pip install mpi4py fastapi uvicorn openai && \
pip install ray
pip install ray numba


WORKDIR /llm/
3 changes: 3 additions & 0 deletions docker/llm/serving/xpu/docker/start-vllm-service.sh
@@ -32,6 +32,9 @@ export TORCH_LLM_ALLREDUCE=0
export CCL_SAME_STREAM=1
export CCL_BLOCKING_WAIT=0

export VLLM_USE_V1=0
export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT

source /opt/intel/1ccl-wks/setvars.sh

python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \