
Commit 62334b5

Merge pull request #286 from ROCm/upstream_merge_24_11_18
Upstream merge 24 11 18
2 parents: 4a185d8 + 8f7daff

301 files changed, +12297 −3765 lines


.buildkite/release-pipeline.yaml

+8 −13

@@ -6,28 +6,23 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      # rename the files to change linux -> manylinux1
-      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+      - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 11.8 wheel"
-    key: block-build-cu118-wheel
-
+  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
+  # However, this block can be uncommented to save some compute hours.
+  # - block: "Build CUDA 11.8 wheel"
+  #   key: block-build-cu118-wheel
+
   - label: "Build wheel - CUDA 11.8"
-    depends_on: block-build-cu118-wheel
+    # depends_on: block-build-cu118-wheel
     agents:
       queue: cpu_queue
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      # rename the files to change linux -> manylinux1
-      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+      - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
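For context, the rename that both jobs previously inlined relies on bash parameter substitution; the doubled "$$" in the pipeline YAML is how Buildkite escapes a literal "$" so the expansion happens inside the job rather than at pipeline upload time. A minimal sketch of the same rename as plain bash (illustrative only; the commit moves this logic into .buildkite/upload-wheels.sh, shown later in this diff):

# Sketch only, not part of the commit: rename wheels tagged "linux" to "manylinux1".
for f in artifacts/dist/*.whl; do
  mv -- "$f" "${f/linux/manylinux1}"   # e.g. ...-linux_x86_64.whl -> ...-manylinux1_x86_64.whl
done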

.buildkite/run-cpu-test-ppc64le.sh

+3 −3

@@ -27,9 +27,9 @@ function cpu_tests() {
       decord einops librosa peft Pillow sentence-transformers soundfile \
       transformers_stream_generator matplotlib datamodel_code_generator
     pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-    pytest -v -s tests/models/embedding/language
-    pytest -v -s tests/models/encoder_decoder/language
-    pytest -v -s tests/models/decoder_only/language/test_models.py
+    pytest -v -s tests/models/decoder_only/language -m cpu_model
+    pytest -v -s tests/models/embedding/language -m cpu_model
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
     pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 

.buildkite/run-cpu-test.sh

+15 −11

@@ -4,20 +4,24 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-48-95}
+NUMA_NODE=${NUMA_NODE:-1}
+
 # Try building the docker image
-numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
 remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
-  --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
-  --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
 
 function cpu_tests() {
   set -e
@@ -34,9 +38,9 @@ function cpu_tests() {
       decord einops librosa peft Pillow sentence-transformers soundfile \
       transformers_stream_generator matplotlib datamodel_code_generator
     pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-    pytest -v -s tests/models/embedding/language
-    pytest -v -s tests/models/encoder_decoder/language
-    pytest -v -s tests/models/decoder_only/language/test_models.py
+    pytest -v -s tests/models/decoder_only/language -m cpu_model
+    pytest -v -s tests/models/embedding/language -m cpu_model
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
     pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
@@ -57,7 +61,7 @@ function cpu_tests() {
   docker exec cpu-test bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10
-    export VLLM_CPU_OMP_THREADS_BIND=48-92
+    export VLLM_CPU_OMP_THREADS_BIND=$1
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
     python3 benchmarks/benchmark_serving.py \
@@ -71,4 +75,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 25 mins.
 export -f cpu_tests
-timeout 25m bash -c "cpu_tests"
+timeout 25m bash -c "cpu_tests $CORE_RANGE"
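Because CORE_RANGE and NUMA_NODE now default via ${VAR:-default} (to 48-95 and 1), the script can be pinned to a different socket without editing it. A usage sketch, assuming a host where NUMA node 0 holds cores 0-47 (the values below are illustrative, not from the commit):

# Illustrative invocation only; adjust the ranges to the host's topology.
CORE_RANGE=0-47 NUMA_NODE=0 bash .buildkite/run-cpu-test.sh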

.buildkite/run-hpu-test.sh

+1 −1

@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py

.buildkite/test-pipeline.yaml

+34 −27

@@ -165,6 +165,14 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
+- label: V1 Test
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  commands:
+  - pytest -v -s v1
+
 - label: Examples Test # 15min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
@@ -305,71 +313,70 @@ steps:
 
 ##### models test #####
 
-- label: Basic Models Test # 10min
+- label: Basic Models Test # 30min
   source_file_dependencies:
   - vllm/
   - tests/models
   commands:
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
+  - pytest -v -s models/test_registry.py
+  - pytest -v -s models/test_initialization.py
 
-- label: Decoder-only Language Models Test (Standard) # 18min
+- label: Language Models Test (Standard) # 42min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
   commands:
-  - pytest -v -s models/decoder_only/language -m core_model
-  - pytest -v -s models/decoder_only/language -m quant_model
+  - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
+  - pytest -v -s models/embedding/language -m core_model
+  - pytest -v -s models/embedding/vision_language -m core_model
 
-- label: Decoder-only Language Models Test (Extended) # 46min
+- label: Language Models Test (Extended) # 50min
   nightly: true
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
   commands:
   - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
+  - pytest -v -s models/embedding/language -m 'not core_model'
+  - pytest -v -s models/embedding/vision_language -m 'not core_model'
 
-- label: Decoder-only Multi-Modal Models Test (Standard) # 22min
+- label: Multi-Modal Models Test (Standard) # 26min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/audio_language
   - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/vision_language
   commands:
-  - pytest -v -s models/decoder_only/audio_language -m core_model
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
-  # No tests under this group for now
-  # - pytest -v -s models/decoder_only/audio_language -m quant_model
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model
+  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+  - pytest -v -s models/encoder_decoder/language -m core_model
+  - pytest -v -s models/encoder_decoder/vision_language -m core_model
 
-- label: Decoder-only Multi-Modal Models Test (Extended) # 1h10m
+- label: Multi-Modal Models Test (Extended) # 1h15m
   nightly: true
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/audio_language
   - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/vision_language
   commands:
   - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
   # HACK - run phi3v tests separately to sidestep this transformers bug
   # https://github.com/huggingface/transformers/issues/34307
   - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
   - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
-
-- label: Other Models Test # 20min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/models/embedding/language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/language
-  - tests/models/encoder_decoder/vision_language
-  commands:
-  - pytest -v -s models/embedding/language
-  - pytest -v -s models/embedding/vision_language
-  - pytest -v -s models/encoder_decoder/language
-  - pytest -v -s models/encoder_decoder/vision_language
+  - pytest -v -s models/encoder_decoder/language -m 'not core_model'
+  - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
 
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
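The reorganized model test groups lean on pytest marker expressions (core_model, quant_model, cpu_model), so one invocation can cover what used to be two. A quick way to preview what a given expression matches, without running anything, is pytest's collect-only mode; a hedged sketch, assuming it is run from the tests/ directory of a vLLM checkout:

# Illustrative only: list the tests matched by a marker expression without executing them.
pytest --collect-only -q models/decoder_only/language -m 'core_model or quant_model'
pytest --collect-only -q models/decoder_only/language -m 'not core_model and not quant_model'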

.buildkite/upload-wheels.sh

+38

@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# Assume wheels are in artifacts/dist/*.whl
+wheel_files=(artifacts/dist/*.whl)
+
+# Check that exactly one wheel is found
+if [[ ${#wheel_files[@]} -ne 1 ]]; then
+  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
+  exit 1
+fi
+
+# Get the single wheel file
+wheel="${wheel_files[0]}"
+
+# Rename 'linux' to 'manylinux1' in the wheel filename
+new_wheel="${wheel/linux/manylinux1}"
+mv -- "$wheel" "$new_wheel"
+wheel="$new_wheel"
+
+# Extract the version from the wheel
+version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+echo "Version: $version"
+
+# If the version contains "dev", rename it to v1.0.0.dev for consistency
+if [[ $version == *dev* ]]; then
+  new_version="1.0.0.dev"
+  new_wheel="${wheel/$version/$new_version}"
+  mv -- "$wheel" "$new_wheel"
+  wheel="$new_wheel"
+  version="$new_version"
+fi
+
+# Upload the wheel to S3
+aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
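The script expects exactly one wheel under artifacts/dist/ and a BUILDKITE_COMMIT variable in the environment, both of which the pipeline steps above provide. A hedged sketch of a local dry run; the wheel filename is hypothetical, and the final aws s3 cp calls still require valid AWS credentials and bucket access (under set -ex the script aborts without them):

# Illustrative only: stage a single (hypothetically named) wheel and run the script.
mkdir -p artifacts/dist
cp /path/to/vllm-0.0.1.dev0-cp38-abi3-linux_x86_64.whl artifacts/dist/
BUILDKITE_COMMIT=$(git rev-parse HEAD) bash .buildkite/upload-wheels.sh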

.github/CODEOWNERS

+10 −7

@@ -3,13 +3,16 @@
 
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-CMakeLists.txt @tlrmchlsmth @WoosukKwon
+/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+CMakeLists.txt @tlrmchlsmth
+
+# vLLM V1
+/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
 
 # Test ownership
 /tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
