Commit 030374b

Merge pull request #213 from ROCm/upstream_merge_24_09_27_0.6.2

Upstream merge 24 09 27 0.6.2

2 parents: 2d7ab9e + f49394a
178 files changed: +8261, -3000 lines

.buildkite/test-pipeline.yaml (+21, -3)

@@ -70,7 +70,7 @@ steps:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
+
 - label: Core Test # 10min
   mirror_hardwares: [amd]
   fast_check: true
@@ -90,8 +90,11 @@ steps:
   commands:
   - pip install -e ./plugins/vllm_add_dummy_model
   - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
   - pytest -v -s entrypoints/openai
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@@ -207,6 +210,21 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4

+- label: "PyTorch Fullgraph Smoke Test"
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph_smoke.py
+
+- label: "PyTorch Fullgraph Test"
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph.py
+
 - label: Kernels Test %N # 30min each
   mirror_hardwares: [amd]
   source_file_dependencies:
@@ -352,7 +370,7 @@ steps:
   - tests/distributed/
   - vllm/compilation
   commands:
-  - pytest -v -s ./compile/test_full_graph.py
+  - pytest -v -s ./compile/test_full_graph_multi_gpu.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
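The two new "PyTorch Fullgraph" jobs above exercise vLLM's torch.compile integration. As a rough sketch of the contract such a smoke test relies on (a toy function, assuming PyTorch 2.x; not the actual code in tests/compile):

import torch

def f(x: torch.Tensor) -> torch.Tensor:
    return torch.relu(x) + 1.0

# fullgraph=True makes Dynamo raise instead of silently falling back to
# eager execution when the function cannot be captured as one graph,
# so a plain call doubles as a capture smoke test.
compiled = torch.compile(f, fullgraph=True)
print(compiled(torch.randn(4)))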

.gitignore (+2, -2)

@@ -1,5 +1,5 @@
-# vllm commit id, generated by setup.py
-vllm/commit_id.py
+# version file generated by setuptools-scm
+/vllm/_version.py
 
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/
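The ignored file moves from vllm/commit_id.py (written by setup.py) to /vllm/_version.py because versioning now comes from setuptools-scm, which derives a version from git metadata and writes it to a generated file. A minimal sketch of such a configuration, in hypothetical setup.py form (the project may wire this up in pyproject.toml instead):

from setuptools import setup

setup(
    name="vllm",
    # Derive the version from git tags and write it to the generated
    # file that the new .gitignore entry excludes from version control.
    use_scm_version={"write_to": "vllm/_version.py"},
    setup_requires=["setuptools-scm>=8"],
)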

CMakeLists.txt (+6)

@@ -236,6 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/gguf/gguf_kernel.cu"
     "csrc/quantization/fp8/fp8_marlin.cu"
     "csrc/custom_all_reduce.cu"
+    "csrc/permute_cols.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
@@ -333,6 +334,11 @@ set(VLLM_MOE_EXT_SRC

 if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC
+    "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
+    "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
+    "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
+    "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
+    "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
     "csrc/moe/marlin_moe_ops.cu")
 endif()

Dockerfile (+3, -4)

@@ -79,15 +79,13 @@ ENV MAX_JOBS=${max_jobs}
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
 
-ARG buildkite_commit
-ENV BUILDKITE_COMMIT=${buildkite_commit}
-
 ARG USE_SCCACHE
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" = "1" ]; then \
         echo "Installing sccache..." \
         && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
@@ -107,6 +105,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" != "1" ]; then \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
@@ -203,7 +202,7 @@ FROM vllm-base AS vllm-openai
 
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+    pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
 
 ENV VLLM_USAGE_SOURCE production-docker-image
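The recurring --mount=type=bind,source=.git,target=.git additions in this and the following Dockerfiles follow from the switch to setuptools-scm: the build now needs git metadata to compute a version, and .git is not otherwise available inside the build step. A sketch of what the build invokes, assuming setuptools-scm is installed:

from setuptools_scm import get_version

# Reads tags and commits from the .git directory to produce a PEP 440
# version string; raises LookupError when no version metadata can be
# found, which is why the Dockerfiles bind-mount .git at build time.
print(get_version(root="."))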

Dockerfile.cpu (+3, -1)

@@ -62,8 +62,10 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-    pip install dist/*.whl
+    pip install dist/*.whl && \
+    rm -rf dist
 
 WORKDIR /workspace/

Dockerfile.neuron (+13, -10)

@@ -6,9 +6,12 @@ FROM $BASE_IMAGE
 RUN echo "Base image is $BASE_IMAGE"
 
 # Install some basic utilities
-RUN apt-get update \
-    && apt-get install python3 python3-pip -y \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1
+RUN apt-get update && \
+    apt-get install -y \
+        git \
+        python3 \
+        python3-pip \
+        ffmpeg libsm6 libxext6 libgl1
 
 ### Mount Point ###
 # When launching the container, mount the code directory to /app
@@ -22,17 +25,17 @@ RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 
-COPY ./vllm /app/vllm/vllm
-COPY ./setup.py /app/vllm/setup.py
-COPY ./requirements-common.txt /app/vllm/requirements-common.txt
-COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+COPY . /app/vllm
 
 RUN cd /app/vllm \
-    && python3 -m pip install -U -r requirements-neuron.txt
+    && python3 -m pip install -U \
+        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        -r requirements-neuron.txt
 
 ENV VLLM_TARGET_DEVICE neuron
-RUN cd /app/vllm \
-    && pip install -e . \
+RUN --mount=type=bind,source=.git,target=.git \
+    cd /app/vllm \
+    && pip install --no-build-isolation -v -e . \
     && cd ..
 
 CMD ["/bin/bash"]

Dockerfile.openvino (+3, -2)

@@ -4,8 +4,9 @@
 FROM ubuntu:22.04 AS dev
 
 RUN apt-get update -y && \
-    apt-get install -y python3-pip git && \
-    apt-get install -y ffmpeg libsm6 libxext6 libgl1
+    apt-get install -y \
+        git python3-pip \
+        ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace
 
 # copy requirements

Dockerfile.ppc64le (+9, -3)

@@ -16,9 +16,15 @@ COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
 
 # These packages will be in rocketce eventually
-RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
-
-RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        torch==2.3.1 \
+        -r requirements-cpu.txt \
+        xformers uvloop==0.20.0
+
+RUN --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
 WORKDIR /workspace/

Dockerfile.tpu (+13, -4)

@@ -5,16 +5,25 @@ FROM $BASE_IMAGE
 WORKDIR /workspace
 
 # Install some basic utilities
-RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
+RUN apt-get update && apt-get install -y \
+    git \
+    ffmpeg libsm6 libxext6 libgl1
 
 # Install the TPU and Pallas dependencies.
-RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
 
 # Build vLLM.
 COPY . /workspace/vllm
 ENV VLLM_TARGET_DEVICE="tpu"
-RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    cd /workspace/vllm && \
+    python3 -m pip install \
+        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        -r requirements-tpu.txt
 RUN cd /workspace/vllm && python3 setup.py develop
 
 CMD ["/bin/bash"]

Dockerfile.xpu (+9, -4)

@@ -7,15 +7,20 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
     chmod 644 /usr/share/keyrings/intel-graphics.gpg
 
-RUN apt-get update -y \
-    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
+RUN apt-get update -y && \
+    apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
 
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
 
-RUN pip install -v -r requirements-xpu.txt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
+        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        -r requirements-xpu.txt
 
-RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=xpu python3 setup.py install
 
 CMD ["/bin/bash"]

benchmarks/benchmark_latency.py (+4, -4)

@@ -11,7 +11,7 @@
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
-from vllm.inputs import PromptType
+from vllm.inputs import PromptInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
 
@@ -62,7 +62,7 @@ def main(args: argparse.Namespace):
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_prompts: List[PromptType] = [{
+    dummy_inputs: List[PromptInputs] = [{
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
 
@@ -75,13 +75,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
                 ],
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
                     str(profile_dir))) as p:
-                llm.generate(dummy_prompts,
+                llm.generate(dummy_inputs,
                              sampling_params=sampling_params,
                              use_tqdm=False)
             print(p.key_averages())
         else:
             start_time = time.perf_counter()
-            llm.generate(dummy_prompts,
+            llm.generate(dummy_inputs,
                          sampling_params=sampling_params,
                          use_tqdm=False)
             end_time = time.perf_counter()
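The benchmark change above renames the annotation from PromptType to PromptInputs; the prompts themselves stay dicts carrying raw token ids. A minimal usage sketch of that call shape (illustrative model and token ids, assuming vLLM at this commit):

from typing import List

from vllm import LLM, SamplingParams
from vllm.inputs import PromptInputs

llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, max_tokens=16)

# Token-id prompts: each element is a dict rather than a plain string.
dummy_inputs: List[PromptInputs] = [{"prompt_token_ids": [1, 2, 3, 4]}]

outputs = llm.generate(dummy_inputs, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)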
