129 changes: 71 additions & 58 deletions .github/workflows/test.yml
@@ -21,6 +21,7 @@ env:
VLLM_PLUGINS: "spyre"
HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -44,19 +45,22 @@ jobs:
- name: "static batching"
markers: "cpu and decoder and not cb and not other_e2e and not quantized"
flags: "--timeout=300"
hf_models: "JackFram/llama-160m"
hf_model: "JackFram/llama-160m"
- name: "fp8"
markers: "cpu and quantized and multi"
flags: "--timeout=600 -k 'basic and test_output' --durations=0"
hf_models: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
- name: "embedding"
markers: "cpu and embedding and not quantized"
flags: "--timeout=300"
hf_models: "sentence-transformers/all-roberta-large-v1"
hf_model: "sentence-transformers/all-roberta-large-v1"
hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
- name: "scoring"
markers: "cpu and scoring"
flags: "--timeout=300"
hf_models: "cross-encoder/stsb-roberta-large"
hf_model: "cross-encoder/stsb-roberta-large"
hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
- name: "continuous batching"
markers: "cpu and cb and not quantized"
flags: "--timeout=300 --durations=0 -s"
Expand All @@ -80,7 +84,8 @@ jobs:
name: "backward compat"
markers: "compat or (cpu and basic)"
flags: "--timeout=300"
hf_models: "micro-g3.3_roberta-large"
hf_model_2: "sentence-transformers/all-roberta-large-v1"
hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
os: "ubuntu-latest"
python_version: "3.12"
# Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
@@ -150,19 +155,34 @@ jobs:
# overwritten.
uv pip install -v .

# Standardize model name for cache keys
- name: Standardize HF model name
- name: "Standardize HF model names for caching"
id: standardize-names
run: |
model="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"
if [[ "$model" == "micro-g3.3_roberta-large" ]]; then
echo "model_key=micro-g3.3_roberta-large" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}" >> "$GITHUB_ENV"
# Replace '/' characters in the HF model name with '--' for GHA cache
# keys and for the model directory names in the local HF hub cache.

# Don't use in-line default values for variable expansion here; that
# could pair a non-default model with the default model revision, as in:
# model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
# revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"

if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
model="${{ matrix.test_suite.hf_model }}"
revision="${{ matrix.test_suite.hf_model_rev }}"
else
# replace / with --
safe_name="${model//\//--}"
echo "model_key=$safe_name" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"
model="${{ env.DEFAULT_HF_MODEL }}"
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
safe_name="${model//\//--}"
echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"

if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
model_2="${{ matrix.test_suite.hf_model_2 }}"
revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
safe_name_2="${model_2//\//--}"
echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
fi
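
For illustration, a minimal sketch (not part of the workflow) of the key/path derivation this step performs, using the default model and revision from the `env` block above; the echoed values are what the bash expansions yield:

```bash
# Hypothetical walk-through of the key/path derivation above:
model="ibm-ai-platform/micro-g3.3-8b-instruct-1b"
revision="6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
safe_name="${model//\//--}"   # replace every '/' with '--'
echo "model_key=${safe_name}_${revision}"
# model_key=ibm-ai-platform--micro-g3.3-8b-instruct-1b_6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f
echo "model_path=${HF_HUB_CACHE}/models--${safe_name}"
# model_path=<HF_HUB_CACHE>/models--ibm-ai-platform--micro-g3.3-8b-instruct-1b
```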

- name: "Restore HF models cache"
@@ -173,54 +193,40 @@ jobs:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}

- name: "Restore HF models cache for additional model"
id: cache_restore_2
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
uses: actions/cache/restore@v4
with:
path: ${{ env.model_2_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

- name: "Download HF models"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
run: |
# We are caching HF models (HF_HUB_CACHE) for reliability rather than speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is limited to 10 GB.
# If a new model is added here, a new hash key is generated. The previous cache blob can then
# be removed by an admin or can be left to expire after 7 days.

download_tinygranite() {
python -c "from transformers import pipeline, AutoTokenizer; pipeline('text-generation', model='$1'); tokenizer=AutoTokenizer.from_pretrained('$1')"
}
download_roberta_large() {
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('$1')"
}
# tinyllama used for static batching tests because static batching is _too slow_
download_tinyllama() {
python -c "from transformers import pipeline; pipeline('text-generation', model='$1')"
}

hf_models="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"

if [[ "$hf_models" == "micro-g3.3_roberta-large" ]]; then
models=("ibm-ai-platform/micro-g3.3-8b-instruct-1b" "sentence-transformers/all-roberta-large-v1")
# We are caching HF models (HF_HUB_CACHE) for reliability rather than
# speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is
# limited to 10 GB.
# If a new model is added here, a new hash key is generated. The
# previous cache blob can then be removed by an admin or can be left
# to expire after 7 days.

if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
model="${{ matrix.test_suite.hf_model }}"
revision="${{ matrix.test_suite.hf_model_rev }}"
else
models=("$hf_models")
model="${{ env.DEFAULT_HF_MODEL }}"
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
model_2="${{ matrix.test_suite.hf_model_2 }}"
revision_2="${{ matrix.test_suite.hf_model_2_rev }}"

for model in "${models[@]}"; do
echo "Downloading $model ..."
case "$model" in
"ibm-ai-platform/micro-g3.3-8b-instruct-1b"*)
download_tinygranite "$model" &
;;
"JackFram/llama-160m")
download_tinyllama "$model" &
;;
"sentence-transformers/all-roberta-large-v1")
download_roberta_large "$model" &
;;
"cross-encoder/stsb-roberta-large")
download_roberta_large "$model" &
;;
*)
echo "No download method found for: $model";
exit 1
;;
esac
done
python3 tools/download_model.py -m "$model" -r "${revision:-main}" &

if [[ -n "$model_2" ]]; then
python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
fi

wait
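
The save/restore steps key on the standard HF hub cache layout; a hedged sanity-check sketch (not part of the workflow) of what the cache should contain once the downloads above finish:

```bash
# Hypothetical inspection of the hub cache after `wait` returns:
ls "${HF_HUB_CACHE}"
# models--ibm-ai-platform--micro-g3.3-8b-instruct-1b
ls "${HF_HUB_CACHE}/models--ibm-ai-platform--micro-g3.3-8b-instruct-1b"
# blobs  refs  snapshots    (snapshots/<revision>/ holds the pinned files)
```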

@@ -231,6 +237,13 @@ jobs:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}

- name: "Save HF models cache for additional model"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
uses: actions/cache/save@v4
with:
path: ${{ env.model_2_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

- name: "Run tests"
if: steps.changed-src-files.outputs.any_changed == 'true'
env:
6 changes: 4 additions & 2 deletions tests/e2e/test_spyre_prompt_logprobs.py
@@ -78,7 +78,8 @@ def test_prompt_logprobs_not_supported_with_cb(
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")

with pytest.raises(ValueError, match="continuous batching"):
VllmConfig(model_config=ModelConfig(model=model.name, task="generate"))
VllmConfig(model_config=ModelConfig(
model=model.name, revision=model.revision, task="generate"))


@pytest.mark.skip
@@ -137,7 +138,8 @@ def _get_hf_prompt_logprobs(model_info: ModelInfo, prompts,
for each token"""
tokenizer = AutoTokenizer.from_pretrained(model_info.name,
revision=model_info.revision)
model = AutoModelForCausalLM.from_pretrained(model_info.name)
model = AutoModelForCausalLM.from_pretrained(model_info.name,
revision=model_info.revision)

prompt_logprobs = {}
for prompt in prompts:
4 changes: 3 additions & 1 deletion tests/golden_token_injector.py
@@ -49,7 +49,9 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device,
# for a couple of requests, which does not have much impact.
# But since this is used mostly for validation, it would be fine
# to keep them.
self.tokenizer = get_tokenizer(vllm_config.model_config.tokenizer)
self.tokenizer = get_tokenizer(
vllm_config.model_config.tokenizer,
revision=vllm_config.model_config.revision)

def is_argmax_invariant(self) -> bool:
"""Never impacts greedy sampling"""
63 changes: 63 additions & 0 deletions tools/download_model.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""Download a model from HuggingFace with revision.

> python3 tools/download_model.py -m <HF-model-id> [-r <git-tag-or-hash>]

"""

import argparse
import logging
import sys


def download_granite_or_llama(model: str, revision: str = "main"):
from transformers import pipeline
pipeline('text-generation', model=model, revision=revision)


def download_roberta(model: str, revision: str = "main"):
from sentence_transformers import SentenceTransformer
SentenceTransformer(model, revision=revision)


download_methods = {
"ibm-ai-platform/micro-g3.3-8b-instruct-1b": download_granite_or_llama,
"ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8": download_granite_or_llama,
"JackFram/llama-160m": download_granite_or_llama,
"cross-encoder/stsb-roberta-large": download_roberta,
"sentence-transformers/all-roberta-large-v1": download_roberta,
}


def download_model_with_revision(model: str, revision: str = "main"):
if model in download_methods:
download_method = download_methods.get(model)
logging.info("Downloading model '%s' with revision '%s' ...", model,
revision)
download_method(model, revision)
logging.info("Model '%s' with revision '%s' downloaded.", model,
revision)
else:
logging.error(
"No `download_method` found for model '%s'."
" Supported models: %s", model, str(list(download_methods.keys())))
exit(1)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", dest="model", help="HuggingFace model ID")
    parser.add_argument("-r",
                        dest="revision",
                        default="main",
                        help="Git hash, tag, or branch (default='main')")
    args, _extra_args = parser.parse_known_args()

    # logging defaults to WARNING; raise the level so the info logs show up
    logging.basicConfig(level=logging.INFO)

    if args.model:
        download_model_with_revision(args.model, args.revision)
    else:
        logging.error("Need to provide a HuggingFace model ID.")
        sys.exit(1)


if __name__ == '__main__':
main()
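
A quick usage sketch, with the model ID and revision taken from the workflow above; a model without a registered `download_method` exits non-zero after logging the supported list:

```bash
python3 tools/download_model.py \
    -m ibm-ai-platform/micro-g3.3-8b-instruct-1b \
    -r 6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f
```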