Commit b2c3b22

Merge pull request #391 from ROCm/upstream_merge_25_01_27
Upstream merge 25 01 27
2 parents 6b2147f + c8b8654 commit b2c3b22

File tree: 221 files changed, +6363 −1987 lines

(Large commits have some content hidden by default; only a subset of the 221 changed files is shown below.)

.buildkite/check-wheel-size.py

+5 −2

@@ -2,8 +2,11 @@
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
+# Please also sync the value with the one in Dockerfile.
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
 
 
 def print_top_10_largest_files(zip_file):
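For reference, a minimal sketch of the size-check pattern this script relies on: the VLLM_MAX_SIZE_MB environment variable overrides the new 300 MB default. The helper name and command line below are illustrative only, not the script's actual interface.

    import os
    import sys


    def wheel_size_ok(wheel_path: str) -> bool:
        # Same override pattern as check-wheel-size.py: env var wins, else 300 MB.
        limit_mb = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
        size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
        print(f"{wheel_path}: {size_mb:.1f} MB (limit {limit_mb} MB)")
        return size_mb <= limit_mb


    if __name__ == "__main__":
        # Illustrative usage: VLLM_MAX_SIZE_MB=400 python wheel_size_sketch.py dist/*.whl
        sys.exit(0 if all(wheel_size_ok(p) for p in sys.argv[1:]) else 1)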

.buildkite/run-neuron-test.sh

+4 −1

@@ -25,8 +25,11 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
     last_build=$(cat /tmp/neuron-docker-build-timestamp)
     current_time=$(date +%s)
     if [ $((current_time - last_build)) -gt 86400 ]; then
+        # Remove dangling images (those that are not tagged and not used by any container)
         docker image prune -f
-        docker system prune -f
+        # Remove unused volumes / force the system prune for old images as well.
+        docker volume prune -f && docker system prune -f
+        # Remove huggingface model artifacts and compiler cache
         rm -rf "${HF_MOUNT:?}/*"
         rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
         echo "$current_time" > /tmp/neuron-docker-build-timestamp

.buildkite/test-pipeline.yaml

+18 −3

@@ -76,7 +76,9 @@ steps:
   - tests/basic_correctness/test_basic_correctness
   - tests/basic_correctness/test_cpu_offload
   - tests/basic_correctness/test_preemption
+  - tests/basic_correctness/test_cumem.py
   commands:
+  - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

@@ -181,7 +183,16 @@ steps:
   - vllm/
   - tests/v1
   commands:
-  - VLLM_USE_V1=1 pytest -v -s v1
+  # split the test to avoid interference
+  - VLLM_USE_V1=1 pytest -v -s v1/core
+  - VLLM_USE_V1=1 pytest -v -s v1/engine
+  - VLLM_USE_V1=1 pytest -v -s v1/sample
+  - VLLM_USE_V1=1 pytest -v -s v1/worker
+  - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
+  - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+  # TODO: accuracy does not match, whether setting
+  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+  - VLLM_USE_V1=1 pytest -v -s v1/e2e
 
 - label: Examples Test # 25min
   working_dir: "/vllm-workspace/examples"

@@ -477,7 +488,9 @@ steps:
   - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+  # this test fails consistently.
+  # TODO: investigate and fix
+  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 

@@ -515,7 +528,9 @@ steps:
   - vllm/engine
   - tests/multi_step
   commands:
-  - pytest -v -s multi_step/test_correctness_async_llm.py
+  # this test is quite flaky
+  # TODO: investigate and fix.
+  # - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py
 
 - label: Pipeline Parallelism Test # 45min

.github/CODEOWNERS

+15 −12

@@ -2,32 +2,35 @@
 # for more info about CODEOWNERS file
 
 # This lists cover the "core" components of vLLM that require careful review
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
+/vllm/model_executor/guided_decoding @mgoin
+/vllm/multimodal @DarkLight1337 @ywang96
 CMakeLists.txt @tlrmchlsmth
 
 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
+/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 
 # Test ownership
-/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
+/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/quantization @mgoin @robertgshaw2-neuralmagic
+/tests/quantization @mgoin @robertgshaw2-redhat
 /.buildkite/lm-eval-harness @mgoin @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/multi_step @alexm-neuralmagic @comaniac
+/tests/multi_step @alexm-redhat @comaniac
 /tests/weight_loading @mgoin @youkaichao
 /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

CMakeLists.txt

old mode 100644, new mode 100755
+53 −29

@@ -24,9 +24,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
 
-# Prevent installation of dependencies (cutlass) by default.
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
-
 #
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.

@@ -215,6 +212,31 @@ endif()
 # Define extension targets
 #
 
+#
+# cumem_allocator extension
+#
+
+set(VLLM_CUMEM_EXT_SRC
+  "csrc/cumem_allocator.cpp")
+
+set_gencode_flags_for_srcs(
+  SRCS "${VLLM_CUMEM_EXT_SRC}"
+  CUDA_ARCHS "${CUDA_ARCHS}")
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  message(STATUS "Enabling cumem allocator extension.")
+  # link against cuda driver library
+  list(APPEND CUMEM_LIBS cuda)
+  define_gpu_extension_target(
+    cumem_allocator
+    DESTINATION vllm
+    LANGUAGE CXX
+    SOURCES ${VLLM_CUMEM_EXT_SRC}
+    LIBRARIES ${CUMEM_LIBS}
+    USE_SABI 3.8
+    WITH_SOABI)
+endif()
+
 #
 # _C extension
 #

@@ -287,7 +309,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
     set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"

@@ -308,8 +330,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
+  # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
+  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(

@@ -363,7 +385,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # 2:4 Sparse Kernels
 
   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
+  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
     set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
              "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")

@@ -463,6 +485,9 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 
 message(STATUS "Enabling C extension.")
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_C_LIBS cuda)
+endif()
 define_gpu_extension_target(
   _C
   DESTINATION vllm

@@ -471,6 +496,7 @@ define_gpu_extension_target(
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+  LIBRARIES ${VLLM_C_LIBS}
   USE_SABI 3
   WITH_SOABI)
 

@@ -570,7 +596,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 
 # vllm-flash-attn currently only supported on CUDA
-if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
   return()
 endif ()
 

@@ -593,7 +619,7 @@ endif()
 # They should be identical but if they aren't, this is a massive footgun.
 #
 # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
-# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
+# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
 # If no component is specified, vllm-flash-attn is still installed.
 
 # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.

@@ -605,43 +631,41 @@ if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
 endif()
 
 if(VLLM_FLASH_ATTN_SRC_DIR)
-  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
+  FetchContent_Declare(
+        vllm-flash-attn SOURCE_DIR
+        ${VLLM_FLASH_ATTN_SRC_DIR}
+        BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
+  )
 else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
+        GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
   )
 endif()
 
-# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
-set(VLLM_PARENT_BUILD ON)
-
-# Ensure the vllm/vllm_flash_attn directory exists before installation
-install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
-
-# Make sure vllm-flash-attn install rules are nested under vllm/
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
-install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
 
 # Fetch the vllm-flash-attn library
 FetchContent_MakeAvailable(vllm-flash-attn)
 message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
 
-# Restore the install prefix
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
+# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
+# case only one is built, in the case both are built redundant work is done)
+install(
+  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+  DESTINATION vllm_flash_attn
+  COMPONENT _vllm_fa2_C
+  FILES_MATCHING PATTERN "*.py"
+)
 
-# Copy over the vllm-flash-attn python files
 install(
-  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm/vllm_flash_attn
-  COMPONENT vllm_flash_attn_c
-  FILES_MATCHING PATTERN "*.py"
+  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+  DESTINATION vllm_flash_attn
+  COMPONENT _vllm_fa3_C
+  FILES_MATCHING PATTERN "*.py"
 )
 
 # Nothing after vllm-flash-attn, see comment about macros above

Dockerfile

+24 −5

@@ -52,7 +52,7 @@ WORKDIR /workspace
 # after this step
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
     fi
 
 COPY requirements-common.txt requirements-common.txt

@@ -126,8 +126,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache
 
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
-# Default max size of the wheel is 250MB
-ARG VLLM_MAX_SIZE_MB=250
+# sync the default value with .buildkite/check-wheel-size.py
+ARG VLLM_MAX_SIZE_MB=300
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \

@@ -149,7 +149,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
+# TODO: Restore to base image after FlashInfer AOT wheel fixed
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace

@@ -194,12 +195,30 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
+# How to build this FlashInfer wheel:
+# $ export FLASHINFER_ENABLE_AOT=1
+# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
+# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
+# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
+# $ cd flashinfer
+# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
+# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
+
 RUN --mount=type=cache,target=/root/.cache/pip \
     . /etc/environment && \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-        python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+        python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
     fi
 COPY examples examples
+
+# Although we build Flashinfer with AOT mode, there's still
+# some issues w.r.t. JIT compilation. Therefore we need to
+# install build dependencies for JIT compilation.
+# TODO: Remove this once FlashInfer AOT wheel is fixed
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-build.txt
+
 #################### vLLM installation IMAGE ####################
 
 #################### TEST IMAGE ####################
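A quick, illustrative sanity check for the pinned FlashInfer wheel inside the built image; the distribution name and version are read off the wheel filename above, and this snippet is not part of the image build itself.

    # Illustrative check: the installed FlashInfer distribution should report the
    # version pinned in the Dockerfile's wheel URL (0.2.0.post1).
    import importlib.metadata

    EXPECTED = "0.2.0.post1"
    installed = importlib.metadata.version("flashinfer-python")
    assert installed == EXPECTED, f"expected {EXPECTED}, got {installed}"
    print(f"flashinfer-python {installed} OK")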
