[CI] Enable model revisions in GHA test #2124
Workflow file for this run

name: Test

on:
  # Don't use `paths` or `paths-ignore` filter since this workflow is required
  # for all pull requests on main irrespective of file type or location
  # Use `changed-src-files` step to determine if source code was changed
  pull_request:
    # add labeled and unlabeled to the default types (runs when label is added)
    types: [opened, synchronize, reopened, labeled, unlabeled, auto_merge_enabled]
    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:

env:
  FORCE_COLOR: "1"
  VLLM_CPU_DISABLE_AVX512: "true"
  VLLM_TARGET_DEVICE: "empty"
  VLLM_PLUGINS: "spyre"
  HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
  DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
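  # Note: DEFAULT_HF_MODEL_REV pins the default model to an exact commit SHA,
  # so CI runs stay reproducible even if the model repo on the Hub changes.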

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
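# For pull requests, `head_ref` groups runs by source branch, so a new push
# cancels any in-flight run for that branch; pushes to main fall back to the
# unique `run_id` and are therefore never cancelled by each other.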

jobs:
  test:
    timeout-minutes: 20
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: ["ubuntu-latest"]
        python_version: ["3.12"]
        vllm_version:
          - name: "default"
            repo: ""
          - name: "vLLM:main"
            repo: "git+https://github.com/vllm-project/vllm --branch main"
        test_suite:
          - name: "static batching"
            markers: "cpu and decoder and not cb and not other_e2e and not quantized"
            flags: "--timeout=300"
            hf_model: "JackFram/llama-160m"
          - name: "fp8"
            markers: "cpu and quantized and multi"
            flags: "--timeout=600 -k 'basic and test_output' --durations=0"
            hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
            hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
          - name: "embedding"
            markers: "cpu and embedding and not quantized"
            flags: "--timeout=300"
            hf_model: "sentence-transformers/all-roberta-large-v1"
            hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
          - name: "scoring"
            markers: "cpu and scoring"
            flags: "--timeout=300"
            hf_model: "cross-encoder/stsb-roberta-large"
            hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
          - name: "continuous batching"
            markers: "cpu and cb and not quantized"
            flags: "--timeout=300 --durations=0 -s"
          - name: "worker and utils"
            markers: "not e2e and not quantized"
            flags: "--timeout=300"
          - name: "compatibility"
            markers: "compat"
            flags: "--timeout=300"
          - name: "other e2e"
            markers: "cpu and other_e2e and not quantized"
            flags: "--timeout=300"
          - name: "precompilation"
            markers: "precompilation and not quantized"
            flags: "--timeout=300"
        include:
          - vllm_version:
              name: "vLLM:lowest"
              repo: "git+https://github.com/vllm-project/vllm --tag v0.10.2"
            test_suite:
              name: "backward compat"
              markers: "compat or (cpu and basic)"
              flags: "--timeout=300"
              hf_model_2: "sentence-transformers/all-roberta-large-v1"
              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
            os: "ubuntu-latest"
            python_version: "3.12"
        # Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
        exclude: >-
          ${{
            github.event_name == 'pull_request' &&
            !(
              contains(toJson(github.event.pull_request.labels), '"ready"') ||
              github.event.action == 'auto_merge_enabled'
            )
            && fromJSON('[{"vllm_version":{"name":"vLLM:main"}}]')
            || fromJSON('[]')
          }}
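        # Example: for an unlabeled PR the expression above evaluates to
        # [{"vllm_version":{"name":"vLLM:main"}}], which drops every vLLM:main
        # entry from the matrix; once the "ready" label is added (or auto-merge
        # is enabled) it evaluates to [] and the full matrix runs.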
name: "${{ matrix.test_suite.name }} (${{ matrix.vllm_version.name }})" | |
steps: | |
- name: "Checkout" | |
uses: actions/checkout@v4 | |
with: | |
fetch-depth: 1 | |
- name: "Get changed source files" | |
id: changed-src-files | |
uses: tj-actions/changed-files@v46 | |
with: # Avoid using single or double quotes for multiline patterns | |
files: | | |
.github/workflows/test.yml | |
pyproject.toml | |
uv.lock | |
tests/**/*.py | |
vllm_spyre/**/*.py | |
- name: "Install PyTorch 2.7.1" | |
if: steps.changed-src-files.outputs.any_changed == 'true' | |
run: | | |
pip install torch=="2.7.1+cpu" --index-url https://download.pytorch.org/whl/cpu | |
- name: "Install uv" | |
if: steps.changed-src-files.outputs.any_changed == 'true' | |
uses: astral-sh/setup-uv@v5 | |
with: | |
version: "latest" | |
python-version: ${{ matrix.python_version }} | |
enable-cache: true | |
ignore-nothing-to-cache: true | |
cache-dependency-glob: | | |
pyproject.toml | |
- name: "Set vLLM version" | |
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.vllm_version.repo ) | |
run: | | |
uv add ${{ matrix.vllm_version.repo }} | |
echo "TEST_VLLM_VERSION=${{ matrix.vllm_version.name }}" >> "$GITHUB_ENV" | |
- name: "Install vLLM with Spyre plugin" | |
if: steps.changed-src-files.outputs.any_changed == 'true' | |
run: | | |
uv venv .venv --system-site-packages | |
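          # --system-site-packages lets the venv see the CPU-only torch that
          # was pip-installed into the runner's system environment above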
          source .venv/bin/activate
          # Syncs both the runtime and dev deps, based on the lockfile contents
          uv sync --frozen
          # Builds and installs the vllm_spyre wheel into .venv
          # This needs to be done after `uv sync`, or the wheel install will be
          # overwritten.
          uv pip install -v .
      - name: "Standardize HF model names for caching"
        id: standardize-names
        run: |
          # replace '/' characters in HF_MODEL with '--' for GHA cache keys and
          # in model file names in local HF hub cache
          # don't use in-line default values for variable expansion here to not
          # use the default model revision with a non-default model like this:
          # model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
          # revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"
          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
            model="${{ matrix.test_suite.hf_model }}"
            revision="${{ matrix.test_suite.hf_model_rev }}"
          else
            model="${{ env.DEFAULT_HF_MODEL }}"
            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
          fi
          safe_name="${model//\//--}"
          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
          echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"
if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then | |
model_2="${{ matrix.test_suite.hf_model_2 }}" | |
revision_2="${{ matrix.test_suite.hf_model_2_rev}}" | |
safe_name_2="${model_2//\//--}" | |
echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV" | |
echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV" | |
fi | |
- name: "Restore HF models cache" | |
id: cache_restore | |
if: steps.changed-src-files.outputs.any_changed == 'true' | |
uses: actions/cache/restore@v4 | |
with: | |
path: ${{ env.model_path }} | |
key: ${{ runner.os }}-hf-model-${{ env.model_key }} | |
- name: "Restore HF models cache for additional model" | |
id: cache_restore_2 | |
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 ) | |
uses: actions/cache/restore@v4 | |
with: | |
path: ${{ env.model_2_path }} | |
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }} | |
- name: "Download HF models" | |
if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true')) | |
run: | | |
# We are caching HF models (HF_HUB_CACHE) for reliability rather than | |
# speed, since HF downloads are flaky for concurrent jobs. | |
# Be careful when adding models to the cache here, as the GHA cache is | |
# limited to 10 GB. | |
# If a new model is added here, a new hash key is generated. The | |
# previous cache blob can then be removed by an admin or can be left | |
# to expire after 7 days. | |
if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then | |
model="${{ matrix.test_suite.hf_model }}" | |
revision="${{ matrix.test_suite.hf_model_rev }}" | |
else | |
model="${{ env.DEFAULT_HF_MODEL }}" | |
revision="${{ env.DEFAULT_HF_MODEL_REV }}" | |
fi | |
model_2="${{ matrix.test_suite.hf_model_2 }}" | |
revision_2="${{ matrix.test_suite.hf_model_2_rev }}" | |
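          # download the model(s) in parallel as background jobs; `wait` blocks
          # until both finish (note: a bare `wait` always exits 0, so a failed
          # download only surfaces later, when the tests run offline)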
          python3 tools/download_model.py -m "$model" -r "${revision:-main}" &
          if [[ -n "$model_2" ]]; then
            python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
          fi
          wait
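      # Cache writes are skipped on pull_request events: caches saved on the
      # default branch are readable from every PR run, whereas a cache saved
      # inside a PR would only be reusable on that same branch.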
- name: "Save HF models cache" | |
if: ( steps.changed-src-files.outputs.any_changed == 'true' && github.event_name != 'pull_request' && steps.cache_restore.outputs.cache-hit != 'true' ) | |
uses: actions/cache/save@v4 | |
with: | |
path: ${{ env.model_path }} | |
key: ${{ runner.os }}-hf-model-${{ env.model_key }} | |
- name: "Save HF models cache for additional model" | |
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' ) | |
uses: actions/cache/save@v4 | |
with: | |
path: ${{ env.model_2_path }} | |
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }} | |
- name: "Run tests" | |
if: steps.changed-src-files.outputs.any_changed == 'true' | |
env: | |
MASTER_PORT: 12355 | |
MASTER_ADDR: localhost | |
DISTRIBUTED_STRATEGY_IGNORE_MODULES: WordEmbedding | |
VLLM_SPYRE_TEST_MODEL_LIST: "${{ matrix.test_suite.name == 'static batching' && 'JackFram/llama-160m' || '' }}" | |
HF_HUB_OFFLINE: 1 | |
run: | | |
# Delete the source code so we can ensure we're testing the installed | |
# wheel | |
rm -fr vllm_spyre | |
          # We activate .venv manually and run pytest directly instead of using
          # `uv run`, to avoid having `uv run` re-sync any dependencies or
          # re-install the vllm_spyre package from source
          source .venv/bin/activate
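          # `-m` selects tests by the pytest markers defined for this suite;
          # per-suite flags (timeouts, -k filters, etc.) come from the matrix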
          python3 -m pytest ${{ matrix.test_suite.flags }} \
            tests -v -m "${{ matrix.test_suite.markers }}"