[CI] Enable model revisions in GHA test #2124

Workflow file for this run

name: Test
on:
# Don't use `paths` or `paths-ignore` filters, since this workflow is required
# for all pull requests on main irrespective of file type or location.
# Use the `changed-src-files` step to determine if source code was changed.
pull_request:
# add labeled, unlabeled, and auto_merge_enabled to the default types
# (re-runs when a label changes or auto-merge is enabled)
types: [opened, synchronize, reopened, labeled, unlabeled, auto_merge_enabled]
branches: [main]
push:
branches: [main]
workflow_dispatch:
env:
FORCE_COLOR: "1"
VLLM_CPU_DISABLE_AVX512: "true"
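# "empty" target presumably builds/installs vLLM without device-specific
# kernels; the Spyre plugin supplies the actual backend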
VLLM_TARGET_DEVICE: "empty"
VLLM_PLUGINS: "spyre"
HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
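# default model and pinned revision, used when a test suite doesn't set its
# own hf_model; pinning revisions keeps cache keys and test inputs stable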
DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
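# group PR runs by head branch so superseded runs are cancelled; push and
# manual runs fall back to the unique run_id and never cancel each other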
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
test:
timeout-minutes: 20
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ["ubuntu-latest"]
python_version: ["3.12"]
vllm_version:
- name: "default"
repo: ""
- name: "vLLM:main"
repo: "git+https://github.com/vllm-project/vllm --branch main"
test_suite:
- name: "static batching"
markers: "cpu and decoder and not cb and not other_e2e and not quantized"
flags: "--timeout=300"
hf_model: "JackFram/llama-160m"
- name: "fp8"
markers: "cpu and quantized and multi"
flags: "--timeout=600 -k 'basic and test_output' --durations=0"
hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
- name: "embedding"
markers: "cpu and embedding and not quantized"
flags: "--timeout=300"
hf_model: "sentence-transformers/all-roberta-large-v1"
hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
- name: "scoring"
markers: "cpu and scoring"
flags: "--timeout=300"
hf_model: "cross-encoder/stsb-roberta-large"
hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
- name: "continuous batching"
markers: "cpu and cb and not quantized"
flags: "--timeout=300 --durations=0 -s"
- name: "worker and utils"
markers: "not e2e and not quantized"
flags: "--timeout=300"
- name: "compatibility"
markers: "compat"
flags: "--timeout=300"
- name: "other e2e"
markers: "cpu and other_e2e and not quantized"
flags: "--timeout=300"
- name: "precompilation"
markers: "precompilation and not quantized"
flags: "--timeout=300"
include:
- vllm_version:
name: "vLLM:lowest"
repo: "git+https://github.com/vllm-project/vllm --tag v0.10.2"
test_suite:
name: "backward compat"
markers: "compat or (cpu and basic)"
flags: "--timeout=300"
hf_model_2: "sentence-transformers/all-roberta-large-v1"
hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
os: "ubuntu-latest"
python_version: "3.12"
# Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
exclude: >-
${{
github.event_name == 'pull_request' &&
!(
contains(toJson(github.event.pull_request.labels), '"ready"') ||
github.event.action == 'auto_merge_enabled'
)
&& fromJSON('[{"vllm_version":{"name":"vLLM:main"}}]')
|| fromJSON('[]')
}}
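# (GHA expressions have no ternary operator; the `cond && a || b` pattern is
# safe here because the first fromJSON() result, an array, is truthy)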
name: "${{ matrix.test_suite.name }} (${{ matrix.vllm_version.name }})"
steps:
- name: "Checkout"
uses: actions/checkout@v4
with:
fetch-depth: 1
- name: "Get changed source files"
id: changed-src-files
uses: tj-actions/changed-files@v46
with: # Avoid using single or double quotes for multiline patterns
files: |
.github/workflows/test.yml
pyproject.toml
uv.lock
tests/**/*.py
vllm_spyre/**/*.py
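# when none of these paths changed, the remaining steps are skipped and this
# required check still reports success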
- name: "Install PyTorch 2.7.1"
if: steps.changed-src-files.outputs.any_changed == 'true'
run: |
pip install torch=="2.7.1+cpu" --index-url https://download.pytorch.org/whl/cpu
- name: "Install uv"
if: steps.changed-src-files.outputs.any_changed == 'true'
uses: astral-sh/setup-uv@v5
with:
version: "latest"
python-version: ${{ matrix.python_version }}
enable-cache: true
ignore-nothing-to-cache: true
cache-dependency-glob: |
pyproject.toml
- name: "Set vLLM version"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.vllm_version.repo )
run: |
uv add ${{ matrix.vllm_version.repo }}
echo "TEST_VLLM_VERSION=${{ matrix.vllm_version.name }}" >> "$GITHUB_ENV"
- name: "Install vLLM with Spyre plugin"
if: steps.changed-src-files.outputs.any_changed == 'true'
run: |
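# --system-site-packages lets the venv reuse the CPU-only torch installed above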
uv venv .venv --system-site-packages
source .venv/bin/activate
# Syncs both the runtime and dev deps, based on the lockfile contents
uv sync --frozen
# Builds and installs the vllm_spyre wheel into .venv
# This needs to be done after `uv sync`, or the wheel install will be
# overwritten.
uv pip install -v .
- name: "Standardize HF model names for caching"
id: standardize-names
run: |
# replace '/' characters in the HF model name with '--' for GHA cache keys
# and for model directory names in the local HF hub cache.
# Don't use in-line default values for variable expansion here, since that
# could pair the default model revision with a non-default model, e.g.:
# model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
# revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"
if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
model="${{ matrix.test_suite.hf_model }}"
revision="${{ matrix.test_suite.hf_model_rev }}"
else
model="${{ env.DEFAULT_HF_MODEL }}"
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
safe_name="${model//\//--}"
echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"
if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
model_2="${{ matrix.test_suite.hf_model_2 }}"
revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
safe_name_2="${model_2//\//--}"
echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
fi
- name: "Restore HF models cache"
id: cache_restore
if: steps.changed-src-files.outputs.any_changed == 'true'
uses: actions/cache/restore@v4
with:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}
- name: "Restore HF models cache for additional model"
id: cache_restore_2
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
uses: actions/cache/restore@v4
with:
path: ${{ env.model_2_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
- name: "Download HF models"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
run: |
# We are caching HF models (HF_HUB_CACHE) for reliability rather than
# speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is
# limited to 10 GB.
# If a new model or revision is added here, a new cache key is generated.
# The previous cache blob can then be removed by an admin or left to
# expire after 7 days.
if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
model="${{ matrix.test_suite.hf_model }}"
revision="${{ matrix.test_suite.hf_model_rev }}"
else
model="${{ env.DEFAULT_HF_MODEL }}"
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
model_2="${{ matrix.test_suite.hf_model_2 }}"
revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
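# download the model(s) in parallel; `wait` blocks until all background
# jobs finish (note: a bare `wait` returns 0 even if a download failed)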
python3 tools/download_model.py -m "$model" -r "${revision:-main}" &
if [[ -n "$model_2" ]]; then
python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
fi
wait
- name: "Save HF models cache"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && github.event_name != 'pull_request' && steps.cache_restore.outputs.cache-hit != 'true' )
uses: actions/cache/save@v4
with:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}
- name: "Save HF models cache for additional model"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
uses: actions/cache/save@v4
with:
path: ${{ env.model_2_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
- name: "Run tests"
if: steps.changed-src-files.outputs.any_changed == 'true'
env:
MASTER_PORT: 12355
MASTER_ADDR: localhost
DISTRIBUTED_STRATEGY_IGNORE_MODULES: WordEmbedding
VLLM_SPYRE_TEST_MODEL_LIST: "${{ matrix.test_suite.name == 'static batching' && 'JackFram/llama-160m' || '' }}"
HF_HUB_OFFLINE: 1
run: |
# Delete the source code so we can ensure we're testing the installed
# wheel
rm -fr vllm_spyre
# We activate .venv manually and run pytest directly instead of using
# `uv run`, to avoid having `uv run` re-sync any dependencies or
# re-install the vllm_spyre package from source
source .venv/bin/activate
python3 -m pytest ${{ matrix.test_suite.flags }} \
tests -v -m "${{ matrix.test_suite.markers }}"