155 changes: 130 additions & 25 deletions .github/workflows/test-spyre.yml
@@ -1,29 +1,134 @@
name: test-sypre
name: Test

on: pull_request
on:
  # Don't use pull_request.paths filter since this workflow is required for
  # all pull requests on main irrespective of file type or location
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
    paths:
      - "tests/**/*.py"
      - "vllm_spyre/**/*.py"
      - pyproject.toml
      - .github/workflows/test-spyre.yml
  workflow_dispatch:

env:
  FORCE_COLOR: "1"
  VLLM_CPU_DISABLE_AVX512: "true"
  VLLM_TARGET_DEVICE: "empty"
  VLLM_PLUGINS: "spyre"
  VLLM_SPYRE_TEST_MODEL_DIR: "${{ github.workspace }}/models"
  HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  test-spyre:
    runs-on: ubuntu-latest
  test:
    timeout-minutes: 20
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: ["ubuntu-latest"]
        python_version: ["3.12"]
        vllm_version:
          - name: "default"
            repo: ""
          - name: "vLLM:main"
            repo: "git+https://github.com/vllm-project/vllm --branch main"
        test_suite:
          - name: "V0"
            tests: "V0 and eager"
            flags: "--timeout=300"
          - name: "V1"
            tests: "(V1- and eager) or test_sampling_metadata_in_input_batch"
            flags: "--timeout=300 --forked"

    name: "${{ matrix.test_suite.name }} (${{ matrix.vllm_version.name }})"

    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Build docker image
        run: docker build . -t vllm-spyre -f Dockerfile.spyre
      - name: Run Spyre tests within docker container
        run: |
          docker run -i --rm --entrypoint /bin/bash vllm-spyre -c '''
          source vllm-spyre/.venv/bin/activate && \
          python -c "from transformers import pipeline; pipeline(\"text-generation\", model=\"JackFram/llama-160m\")" && \
          export VARIANT=$(ls /root/.cache/huggingface/hub/models--JackFram--llama-160m/snapshots/) && \
          mkdir -p /models && \
          ln -s /root/.cache/huggingface/hub/models--JackFram--llama-160m/snapshots/${VARIANT} /models/llama-194m && \
          python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer(\"sentence-transformers/all-roberta-large-v1\")" && \
          export VARIANT=$(ls /root/.cache/huggingface/hub/models--sentence-transformers--all-roberta-large-v1/snapshots/) && \
          ln -s /root/.cache/huggingface/hub/models--sentence-transformers--all-roberta-large-v1/snapshots/${VARIANT} /models/all-roberta-large-v1 && \
          export MASTER_PORT=12355 && \
          export MASTER_ADDR=localhost && \
          export DISTRIBUTED_STRATEGY_IGNORE_MODULES=WordEmbedding && \
          cd vllm-spyre && \
          python -m pytest --timeout=300 tests -v -k "V0 and eager" && \
          python -m pytest --forked --timeout=300 tests -v -k "(V1- and eager) or test_sampling_metadata_in_input_batch"
          '''
- name: "Checkout"
uses: actions/checkout@v4
with:
fetch-depth: 1

- name: "Install PyTorch"
run: |
pip install torch=="2.5.1+cpu" --index-url https://download.pytorch.org/whl/cpu

- name: "Install uv"
uses: astral-sh/setup-uv@v5
with:
version: "latest"
python-version: ${{ matrix.python_version }}
enable-cache: true
ignore-nothing-to-cache: true
cache-dependency-glob: |
pyproject.toml

- name: "Set vLLM version"
if: matrix.vllm_version.repo
run: |
uv add ${{ matrix.vllm_version.repo }}

- name: "Install vLLM with Spyre plugin"
run: |
uv venv .venv --system-site-packages
source .venv/bin/activate
uv pip install -v .
uv sync --frozen --group dev

- name: "Restore HF models cache"
uses: actions/cache/restore@v4
with:
path: ${{ env.HF_HUB_CACHE }}
key: ${{ runner.os }}-hub-cache-${{ hashFiles('cached_models.txt') }}
restore-keys: |
${{ runner.os }}-hub-cache

- name: "Download HF models"
run: |
mkdir -p "${VLLM_SPYRE_TEST_MODEL_DIR}"

# We are caching HF models (HF_HUB_CACHE) for reliability rather than speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is limited to 10 GB.
# If a new model is added here, hashFiles('cached_models.txt') should create a new hash key. The previous cache blob can then
# be removed by an admin or can be left to expire after 7 days.

download_jackfram_llama() {
python -c "from transformers import pipeline; pipeline('text-generation', model='JackFram/llama-160m')"
VARIANT=$(ls "${HF_HUB_CACHE}/models--JackFram--llama-160m/snapshots/")
ln -s "${HF_HUB_CACHE}/models--JackFram--llama-160m/snapshots/${VARIANT}" "${VLLM_SPYRE_TEST_MODEL_DIR}/llama-194m"
}
download_roberta_large() {
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-roberta-large-v1')"
VARIANT=$(ls "${HF_HUB_CACHE}/models--sentence-transformers--all-roberta-large-v1/snapshots/")
ln -s "${HF_HUB_CACHE}/models--sentence-transformers--all-roberta-large-v1/snapshots/${VARIANT}" "${VLLM_SPYRE_TEST_MODEL_DIR}/all-roberta-large-v1"
}
download_jackfram_llama &
download_roberta_large &
wait
ls "${VLLM_SPYRE_TEST_MODEL_DIR}" > cached_models.txt

- name: "Save HF models cache"
if: ( github.event_name != 'pull_request' && strategy.job-index == 0 )
uses: actions/cache/save@v4
with:
path: ${{ env.HF_HUB_CACHE }}
key: ${{ runner.os }}-hub-cache-${{ hashFiles('cached_models.txt') }}

- name: "Run tests"
env:
MASTER_PORT: 12355
MASTER_ADDR: localhost
DISTRIBUTED_STRATEGY_IGNORE_MODULES: WordEmbedding
run: |
source .venv/bin/activate
uv run pytest ${{ matrix.test_suite.flags }} \
tests -v -k "${{ matrix.test_suite.tests }}"
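
Taken together, the matrix above expands into four checks per pull request (the V0 and V1 suites, each against the pinned vLLM and against vLLM main). Below is a rough local approximation of the "Run tests" step for one matrix cell; the virtualenv path, model directory, and rendezvous values mirror the workflow's env block and are assumptions for local use, not part of the PR itself.

# Assumes the plugin was already installed into .venv as in the
# "Install vLLM with Spyre plugin" step above.
export MASTER_PORT=12355
export MASTER_ADDR=localhost
export DISTRIBUTED_STRATEGY_IGNORE_MODULES=WordEmbedding
export VLLM_SPYRE_TEST_MODEL_DIR="$PWD/models"
source .venv/bin/activate

# "V0" suite: 300s per-test timeout, no forking
uv run pytest --timeout=300 tests -v -k "V0 and eager"

# "V1" suite: same timeout, each test run in a forked subprocess
uv run pytest --timeout=300 --forked tests -v -k "(V1- and eager) or test_sampling_metadata_in_input_batch"
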
1 change: 1 addition & 0 deletions pyproject.toml
@@ -102,6 +102,7 @@ use_parentheses = true
skip_gitignore = true

[tool.pytest.ini_options]
pythonpath = ["."]
markers = [
"skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly",
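
The one-line pyproject.toml addition tells pytest to put the repository root on sys.path, so test modules can import shared helpers from the repo without an editable install. A rough command-line equivalent of the same setting, shown here only for illustration:

# same effect as pythonpath = ["."] under [tool.pytest.ini_options],
# when invoked from the repository root
PYTHONPATH="$PWD" uv run pytest tests -v -k "V0 and eager"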