
Commit b758abb

Merge pull request #478 from ROCm/unified_fp8
upstream_merge_25_03_12
2 parents 1aec156 + d7657c2 commit b758abb

154 files changed (+7760, −1206 lines)


.buildkite/release-pipeline.yaml (+2)

@@ -57,6 +57,8 @@ steps:
     agents:
       queue: tpu_queue_postmerge
     commands:
+      - "rm /var/log/syslog"
+      - "rm /var/log/kern.log"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
       - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"

.buildkite/run-cpu-test.sh (+5, −3)

@@ -19,13 +19,14 @@ remove_docker_container

 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
+  export BUILDKITE_BUILD_NUMBER=$3

   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "

@@ -36,6 +37,7 @@ function cpu_tests() {
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pip install -r vllm/requirements/test.txt
+    pip install -r vllm/requirements/cpu.txt
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model

@@ -85,4 +87,4 @@ function cpu_tests() {

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
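
The last hunk forwards `$BUILDKITE_BUILD_NUMBER` as a third positional argument because `cpu_tests` is re-invoked through `bash -c`, where an exported function only sees the arguments written into the command string. A minimal sketch of that pattern, using hypothetical names and values:

```bash
#!/usr/bin/env bash
# Sketch of the export -f pattern used above (hypothetical function and values).
my_tests() {
  local core_range=$1 numa_node=$2 build_number=$3
  echo "cores=${core_range} numa=${numa_node} build=${build_number}"
}
export -f my_tests                          # make the function visible to child shells
timeout 40m bash -c "my_tests 0-7 0 12345"  # positional args must be passed explicitly
```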

.buildkite/run-tpu-test.sh (−1)

@@ -19,7 +19,6 @@ docker run --privileged --net host --shm-size=16G -it \
   vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
   && python3 -m pip install pytest \
   && python3 -m pip install lm_eval[api]==0.4.4 \
-  && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
   && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
   && python3 /workspace/vllm/tests/tpu/test_compilation.py \
   && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \

.github/workflows/scripts/create_release.js (+1, −1)

@@ -1,4 +1,4 @@
-// Uses Github's API to create the release and wait for result.
+// Uses GitHub's API to create the release and wait for result.
 // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.

 module.exports = async (github, context, core) => {

CMakeLists.txt (+10, −15)

@@ -156,20 +156,6 @@ endif()
 #
 get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})

-#
-# Get supported FP8 format based on GPU arches
-#
-get_supported_fp8_format(FP8_FORMAT ${VLLM_GPU_LANG} "${VLLM_GPU_ARCHES}")
-if(${FP8_FORMAT} STREQUAL "E4M3FN")
-  message(STATUS "FP8 format: E4M3FN")
-  list(APPEND VLLM_GPU_FLAGS "-DUSE_CUDA_FP8_FORMAT")
-elseif(${FP8_FORMAT} STREQUAL "E4M3FNUZ")
-  message(STATUS "FP8 format: E4M3FNUZ")
-  list(APPEND VLLM_GPU_FLAGS "-DUSE_HIP_FP8_FORMAT")
-elseif(${FP8_FORMAT} STREQUAL "CONFLICT")
-  message(FATAL_ERROR "Target architectures support different types of FP8 formats!")
-endif()
-
 #
 # Set nvcc parallelism.
 #

@@ -437,7 +423,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # 2:4 Sparse Kernels

   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper and Blackwell).
+  # require CUDA 12.2 or later (and only work on Hopper).
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(

@@ -585,6 +572,14 @@ set_gencode_flags_for_srcs(
   CUDA_ARCHS "${CUDA_ARCHS}")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(VLLM_MOE_WNA16_SRC
+    "csrc/moe/moe_wna16.cu")
+
+  set_gencode_flags_for_srcs(
+    SRCS "${VLLM_MOE_WNA16_SRC}"
+    CUDA_ARCHS "${CUDA_ARCHS}")
+
+  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
     set(MARLIN_MOE_SRC

Dockerfile.cpu (+1, −1)

@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

 RUN echo 'ulimit -c 0' >> ~/.bashrc

-RUN pip install intel_extension_for_pytorch==2.5.0
+RUN pip install intel_extension_for_pytorch==2.6.0

 WORKDIR /workspace

Dockerfile.tpu (+3)

@@ -15,6 +15,9 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

+# Remove existing versions of dependencies
+RUN pip uninstall -y torch torch_xla torchvision
+
 ENV VLLM_TARGET_DEVICE="tpu"
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
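
Uninstalling the preinstalled torch stack keeps the base image's wheels from shadowing the TPU-specific versions installed later in the build. An illustrative check (not part of the Dockerfile) that the finished image carries a single, consistent torch stack, using the `vllm/vllm-tpu:nightly` tag from the release pipeline above:

```bash
# Illustrative only: list the torch-related packages inside the built TPU image.
docker run --rm --entrypoint /bin/bash vllm/vllm-tpu:nightly -c \
  "python3 -m pip list | grep -i torch"
```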

Dockerfile.xpu (+10, −12)

@@ -1,4 +1,4 @@
-FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
+FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \

@@ -21,7 +21,8 @@ RUN apt-get update -y && \
     python3 \
     python3-dev \
     python3-pip \
-    # vim \
+    libze-intel-gpu-dev \
+    libze-intel-gpu1 \
     wget

 WORKDIR /workspace/vllm

@@ -32,19 +33,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --no-cache-dir \
     -r requirements/xpu.txt

-RUN git clone https://github.com/intel/pti-gpu && \
-    cd pti-gpu/sdk && \
-    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
-    mkdir build && \
-    cd build && \
-    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
-    make -j && \
-    cmake --install . --config Release --prefix "/usr/local"
-
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"

 COPY . .
-ARG GIT_REPO_CHECK
+ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

@@ -54,6 +46,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     python3 setup.py install

+# Per the XPU docs, intel-extension-for-pytorch 2.6.10+xpu must be installed manually because of conflicting dependencies with torch 2.6.0+xpu.
+# FIXME: This will be fixed in ipex 2.7; this note is left here for awareness.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install intel-extension-for-pytorch==2.6.10+xpu \
+    --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+
 CMD ["/bin/bash"]

 FROM vllm-base AS vllm-openai
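
Since intel-extension-for-pytorch 2.6.10+xpu is pinned manually on top of torch 2.6.0+xpu, it can be worth confirming that the two import together in the final image. A hedged sketch, assuming the image was built from this Dockerfile and tagged `vllm-xpu` (a hypothetical tag):

```bash
# Illustrative only: verify that torch and IPEX load side by side in the XPU image.
docker run --rm --entrypoint /bin/bash vllm-xpu -c \
  "python3 -c 'import torch, intel_extension_for_pytorch as ipex; print(torch.__version__, ipex.__version__)'"
```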

README.md (+3, −3)

@@ -91,7 +91,7 @@ pip install vllm
 ```

 Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
-- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
+- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
 - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
 - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)

@@ -151,9 +151,9 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 ## Contact Us

-- For technical questions and feature requests, please use Github issues or discussions.
+- For technical questions and feature requests, please use GitHub issues or discussions.
 - For discussing with fellow users and coordinating contributions and development, please use Slack.
-- For security disclosures, please use Github's security advisory feature.
+- For security disclosures, please use GitHub's security advisory feature.
 - For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.

 ## Media Kit

benchmarks/README.md (+165, −13)

@@ -1,29 +1,181 @@
 # Benchmarking vLLM

-## Downloading the ShareGPT dataset
+This README guides you through running benchmark tests with the extensive
+datasets supported on vLLM. It’s a living document, updated as new features and datasets
+become available.

-You can download the dataset by running:
+## Dataset Overview
+
+<table style="width:100%; border-collapse: collapse;">
+  <thead>
+    <tr>
+      <th style="width:15%; text-align: left;">Dataset</th>
+      <th style="width:10%; text-align: center;">Online</th>
+      <th style="width:10%; text-align: center;">Offline</th>
+      <th style="width:65%; text-align: left;">Data Path</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td><strong>ShareGPT</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
+    </tr>
+    <tr>
+      <td><strong>BurstGPT</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
+    </tr>
+    <tr>
+      <td><strong>Sonnet</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td>Local file: <code>benchmarks/sonnet.txt</code></td>
+    </tr>
+    <tr>
+      <td><strong>Random</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>synthetic</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">🚧</td>
+      <td>Specify your dataset path on HuggingFace</td>
+    </tr>
+    <tr>
+      <td><strong>VisionArena</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">🚧</td>
+      <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
+    </tr>
+  </tbody>
+</table>
+✅: supported
+🚧: to be supported
+
+**Note**: VisionArena’s `dataset-name` should be set to `hf`
+
+---
+## Example - Online Benchmark
+
+First start serving your model

 ```bash
-wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+vllm serve ${MODEL_NAME} --disable-log-requests
 ```

-## Downloading the ShareGPT4V dataset
+Then run the benchmarking script
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="sharegpt"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
+python3 benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
+```
+
+If successful, you will see the following output
+
+```
+============ Serving Benchmark Result ============
+Successful requests:                     10
+Benchmark duration (s):                  5.78
+Total input tokens:                      1369
+Total generated tokens:                  2212
+Request throughput (req/s):              1.73
+Output token throughput (tok/s):         382.89
+Total Token throughput (tok/s):          619.85
+---------------Time to First Token----------------
+Mean TTFT (ms):                          71.54
+Median TTFT (ms):                        73.88
+P99 TTFT (ms):                           79.49
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          7.91
+Median TPOT (ms):                        7.96
+P99 TPOT (ms):                           8.03
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           7.74
+Median ITL (ms):                         7.70
+P99 ITL (ms):                            8.39
+==================================================
+```

-The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
-will ignore a datapoint if the referred image is missing.
+### VisionArena Benchmark for Vision Language Models

 ```bash
-wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
-mkdir coco -p
-wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
-unzip coco/train2017.zip -d coco/
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```

-# Downloading the BurstGPT dataset
+```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT='train'
+
+python3 benchmarks/benchmark_serving.py \
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}"
+```

-You can download the BurstGPT v1.1 dataset by running:
+---
+## Example - Offline Throughput Benchmark

 ```bash
-wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+NUM_PROMPTS=10
+DATASET_NAME="sonnet"
+DATASET_PATH="benchmarks/sonnet.txt"
+
+python3 benchmarks/benchmark_throughput.py \
+  --model "${MODEL_NAME}" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --num-prompts "${NUM_PROMPTS}"
+```
+
+If successful, you will see the following output
+
+```
+Throughput: 7.35 requests/s, 4789.20 total tokens/s, 1102.83 output tokens/s
 ```
+
+### Benchmark with LoRA Adapters
+
+``` bash
+MODEL_NAME="meta-llama/Llama-2-7b-hf"
+BACKEND="vllm"
+DATASET_NAME="sharegpt"
+DATASET_PATH="/home/jovyan/data/vllm_benchmark_datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
+NUM_PROMPTS=10
+MAX_LORAS=2
+MAX_LORA_RANK=8
+ENABLE_LORA="--enable-lora"
+LORA_PATH="yard1/llama-2-7b-sql-lora-test"
+
+python3 benchmarks/benchmark_throughput.py \
+  --model "${MODEL_NAME}" \
+  --backend "${BACKEND}" \
+  --dataset_path "${DATASET_PATH}" \
+  --dataset_name "${DATASET_NAME}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --max-loras "${MAX_LORAS}" \
+  --max-lora-rank "${MAX_LORA_RANK}" \
+  ${ENABLE_LORA} \
+  --lora-path "${LORA_PATH}"
+```
