129 changes: 71 additions & 58 deletions .github/workflows/test.yml
@@ -21,6 +21,7 @@ env:
VLLM_PLUGINS: "spyre"
HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -44,19 +45,22 @@ jobs:
- name: "static batching"
markers: "cpu and decoder and not cb and not other_e2e and not quantized"
flags: "--timeout=300"
hf_models: "JackFram/llama-160m"
hf_model: "JackFram/llama-160m"
- name: "fp8"
markers: "cpu and quantized and multi"
flags: "--timeout=600 -k 'basic and test_output' --durations=0"
hf_models: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
- name: "embedding"
markers: "cpu and embedding and not quantized"
flags: "--timeout=300"
hf_models: "sentence-transformers/all-roberta-large-v1"
hf_model: "sentence-transformers/all-roberta-large-v1"
hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
- name: "scoring"
markers: "cpu and scoring"
flags: "--timeout=300"
hf_models: "cross-encoder/stsb-roberta-large"
hf_model: "cross-encoder/stsb-roberta-large"
hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
- name: "continuous batching"
markers: "cpu and cb and not quantized"
flags: "--timeout=300 --durations=0 -s"
@@ -80,7 +84,8 @@ jobs:
name: "backward compat"
markers: "compat or (cpu and basic)"
flags: "--timeout=300"
hf_models: "micro-g3.3_roberta-large"
hf_model_2: "sentence-transformers/all-roberta-large-v1"
hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
os: "ubuntu-latest"
python_version: "3.12"
# Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
@@ -150,19 +155,34 @@ jobs:
# overwritten.
uv pip install -v .

# Standardize model name for cache keys
- name: Standardize HF model name
- name: "Standardize HF model names for caching"
id: standardize-names
run: |
model="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"
if [[ "$model" == "micro-g3.3_roberta-large" ]]; then
echo "model_key=micro-g3.3_roberta-large" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}" >> "$GITHUB_ENV"
# replace '/' characters in HF_MODEL with '--' for GHA cache keys and
# in model file names in local HF hub cache
Comment on lines +161 to +162 (Collaborator):
A comment on what this looks like would be nice, e.g. what the name was before and what it looks like after the replacement.
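
For illustration, the replacement being discussed is the bash expansion ${model//\//--} a few lines below; with the workflow's default model and revision it would produce:

model="ibm-ai-platform/micro-g3.3-8b-instruct-1b"
revision="6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
safe_name="${model//\//--}"   # every '/' becomes '--'
echo "model_key=${safe_name}_${revision}"
# -> model_key=ibm-ai-platform--micro-g3.3-8b-instruct-1b_6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f
echo "model_path=${HF_HUB_CACHE}/models--${safe_name}"
# -> model_path=$HF_HUB_CACHE/models--ibm-ai-platform--micro-g3.3-8b-instruct-1b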


# don't use in-line default values for variable expansion here to not
# use the default model revision with a non-default model like this:
# model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
# revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"

if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
model="${{ matrix.test_suite.hf_model }}"
revision="${{ matrix.test_suite.hf_model_rev }}"
else
# replace / with --
safe_name="${model//\//--}"
echo "model_key=$safe_name" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"
model="${{ env.DEFAULT_HF_MODEL }}"
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
safe_name="${model//\//--}"
echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"

if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
model_2="${{ matrix.test_suite.hf_model_2 }}"
revision_2="${{ matrix.test_suite.hf_model_2_rev}}"
safe_name_2="${model_2//\//--}"
echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
fi

- name: "Restore HF models cache"
@@ -173,54 +193,40 @@ jobs:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}

- name: "Restore HF models cache for additional model"
id: cache_restore_2
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
uses: actions/cache/restore@v4
with:
path: ${{ env.model_2_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

- name: "Download HF models"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
run: |
# We are caching HF models (HF_HUB_CACHE) for reliability rather than speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is limited to 10 GB.
# If a new model is added here, a new hash key is generated. The previous cache blob can then
# be removed by an admin or can be left to expire after 7 days.

download_tinygranite() {
python -c "from transformers import pipeline, AutoTokenizer; pipeline('text-generation', model='$1'); tokenizer=AutoTokenizer.from_pretrained('$1')"
}
download_roberta_large() {
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('$1')"
}
# tinyllama used for static batching tests because static batching is _too slow_
download_tinyllama() {
python -c "from transformers import pipeline; pipeline('text-generation', model='$1')"
}

hf_models="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"

if [[ "$hf_models" == "micro-g3.3_roberta-large" ]]; then
models=("ibm-ai-platform/micro-g3.3-8b-instruct-1b" "sentence-transformers/all-roberta-large-v1")
# We are caching HF models (HF_HUB_CACHE) for reliability rather than
# speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is
# limited to 10 GB.
# If a new model is added here, a new hash key is generated. The
# previous cache blob can then be removed by an admin or can be left
# to expire after 7 days.

if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
model="${{ matrix.test_suite.hf_model }}"
revision="${{ matrix.test_suite.hf_model_rev }}"
else
models=("$hf_models")
model="${{ env.DEFAULT_HF_MODEL }}"
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
model_2="${{ matrix.test_suite.hf_model_2 }}"
revision_2="${{ matrix.test_suite.hf_model_2_rev }}"

for model in "${models[@]}"; do
echo "Downloading $model ..."
case "$model" in
"ibm-ai-platform/micro-g3.3-8b-instruct-1b"*)
download_tinygranite "$model" &
;;
"JackFram/llama-160m")
download_tinyllama "$model" &
;;
"sentence-transformers/all-roberta-large-v1")
download_roberta_large "$model" &
;;
"cross-encoder/stsb-roberta-large")
download_roberta_large "$model" &
;;
*)
echo "No download method found for: $model";
exit 1
;;
esac
done
python3 tools/download_model.py -m "$model" -r "${revision:-main}" &

if [[ -n "$model_2" ]]; then
python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
fi

wait

@@ -231,6 +237,13 @@ jobs:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}

- name: "Save HF models cache for additional model"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
uses: actions/cache/save@v4
with:
path: ${{ env.model_2_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

- name: "Run tests"
if: steps.changed-src-files.outputs.any_changed == 'true'
env:
6 changes: 4 additions & 2 deletions tests/e2e/test_spyre_prompt_logprobs.py
@@ -78,7 +78,8 @@ def test_prompt_logprobs_not_supported_with_cb(
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")

with pytest.raises(ValueError, match="continuous batching"):
VllmConfig(model_config=ModelConfig(model=model.name, task="generate"))
VllmConfig(model_config=ModelConfig(
model=model.name, revision=model.revision, task="generate"))


@pytest.mark.skip
@@ -137,7 +138,8 @@ def _get_hf_prompt_logprobs(model_info: ModelInfo, prompts,
for each token"""
tokenizer = AutoTokenizer.from_pretrained(model_info.name,
revision=model_info.revision)
model = AutoModelForCausalLM.from_pretrained(model_info.name)
model = AutoModelForCausalLM.from_pretrained(model_info.name,
revision=model_info.revision)

prompt_logprobs = {}
for prompt in prompts:
63 changes: 63 additions & 0 deletions tools/download_model.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
Collaborator:
Why is this tool required if you can use huggingface-cli download? Is it to make sure that only the necessary and sufficient set of files is downloaded?

Collaborator Author (@ckadner), Oct 14, 2025:
We are using it for the GitHub Action workflow to download models with revisions (unless they are already cached in a GHA cache blob).

#499 (comment)

I recall being told, and seeing it in comments, that the HuggingFace CLI is not reliable during GitHub Actions runs, though I have never put that to the test myself.

Collaborator:
Worth a shot? I also seem to remember that the HF CLI was downloading something weird at one point, but I don't see that lately. Maybe something got fixed?

Collaborator Author (@ckadner), Oct 14, 2025:
> I recall being told, and seeing it in comments, that the HuggingFace CLI is not reliable during GitHub Actions runs, though I have never put that to the test myself.

> Worth a shot? I also seem to remember that the HF CLI was downloading something weird at one point, but I don't see that lately. Maybe something got fixed?

I tried using the hf download action and it failed in 3 of 10 test jobs:

https://github.com/ckadner/vllm-spyre/actions/runs/18510700226/job/52750674898?pr=20

 Traceback (most recent call last):
  File "/home/runner/work/vllm-spyre/vllm-spyre/.venv/bin/huggingface-cli", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/home/runner/work/vllm-spyre/vllm-spyre/.venv/lib/python3.12/site-packages/huggingface_hub/commands/huggingface_cli.py", line 61, in main

Collaborator Author (@ckadner), Oct 14, 2025:
Oh, System.IO.IOException: No space left on device.

[EDIT]
I wonder if the HF CLI download (temporarily) creates/keeps two copies of the files while the download is ongoing, which would exceed the available disk space on the GHA runner?

Collaborator:
hmmm

Collaborator Author (@ckadner), Oct 14, 2025:
One more time with hf download --max-workers 2 ...

Downloading 'pytorch_model.bin' to '/home/runner/work/vllm-spyre/vllm-spyre/.cache/huggingface/hub/models--cross-encoder--stsb-roberta-large/blobs/03023f7dcd714c15ff27d534432a80d3bff78c9b50778a44b10585ef5fa7fd25.incomplete'
/home/runner/work/vllm-spyre/vllm-spyre/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:801: UserWarning: Not enough free disk space to download the file. The expected file size is: 1421.62 MB. The target location /home/runner/work/vllm-spyre/vllm-spyre/.cache/huggingface/hub/models--cross-encoder--stsb-roberta-large/blobs only has 1414.68 MB free disk space.

Collaborator Author (@ckadner), Oct 14, 2025:
We could probably figure out how the space gets used, which files to exclude and/or how to make more space on the GHA runner.

But we already have the download script, and the code in it has been running fine for several months. So I vote for keeping the existing custom download code, albeit in a separate script now.

Collaborator:
Yes, often repositories have extra files such as model weights in other formats. I think the script only downloads the required files, so that would explain why it doesn't run out of disk space. I would prefer less code to maintain, but since we have these space restrictions, I guess for now it's better to keep the script.
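
For reference, the CLI alternative weighed in this thread would look roughly like the sketch below (--max-workers is the flag mentioned above; --revision is assumed to behave as documented for the huggingface_hub CLI). A plain snapshot download like this fetches every file in the repository, including weights in other formats, which is why it can exceed the runner's free disk space, while the loaders used in the script below fetch only the files they need.

# rough sketch of the CLI alternative, not what the PR adopts
hf download ibm-ai-platform/micro-g3.3-8b-instruct-1b \
    --revision 6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f \
    --max-workers 2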

"""Download a model from HuggingFace with revision.
> python3 tools/download_model.py -m <HF-model-id> [-r <git-tag-or-hash>]
"""

import argparse
import logging


def download_granite_or_llama(model: str, revision: str = "main"):
from transformers import pipeline
pipeline('text-generation', model=model, revision=revision)


def download_roberta(model: str, revision: str = "main"):
from sentence_transformers import SentenceTransformer
SentenceTransformer(model, revision=revision)


download_methods = {
"ibm-ai-platform/micro-g3.3-8b-instruct-1b": download_granite_or_llama,
"ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8": download_granite_or_llama,
"JackFram/llama-160m": download_granite_or_llama,
"cross-encoder/stsb-roberta-large": download_roberta,
"sentence-transformers/all-roberta-large-v1": download_roberta,
}


def download_model_with_revision(model: str, revision: str = "main"):
if model in download_methods:
download_method = download_methods.get(model)
logging.info("Downloading model '%s' with revision '%s' ...", model,
revision)
download_method(model, revision)
logging.info("Model '%s' with revision '%s' downloaded.", model,
revision)
else:
logging.error(
"No `download_method` found for model '%s'."
" Supported models: %s", model, str(list(download_methods.keys())))
exit(1)


def main():
parser = argparse.ArgumentParser()
parser.add_argument("-m", dest="model", help="HuggingFace model ID")
parser.add_argument("-r",
dest="revision",
default="main",
help="Git hash, tag, or branch (default='main')")
args, _extra_args = parser.parse_known_args()

if args.model:
download_model_with_revision(args.model, args.revision)
else:
logging.error("Need to provide a HuggingFace model ID.")
exit(1)


if __name__ == '__main__':
main()
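
A rough standalone usage sketch, with a model ID and revision taken from the workflow matrix above:

# downloads the FP8 test model at its pinned revision into the local HF hub cache
python3 tools/download_model.py \
    -m "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8" \
    -r "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"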