From e8794ed3e516ab31e3dffe6dc53ab955e3b300e9 Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 12:40:23 -0700
Subject: [PATCH 1/7] GHA test with model revisions

Signed-off-by: Christian Kadner
---
 .github/workflows/test.yml | 118 +++++++++++++++++++------------------
 tools/download_model.py    |  63 ++++++++++++++++++++
 2 files changed, 125 insertions(+), 56 deletions(-)
 create mode 100755 tools/download_model.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 8373376b3..f5a3c70f7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,6 +21,8 @@ env:
   VLLM_PLUGINS: "spyre"
   HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
   DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
+  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
+# DEFAULT_HF_MODEL_REV: "2714578f54cfb744ece40df9326ee0b47e879e03"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -44,19 +46,20 @@ jobs:
         - name: "static batching"
           markers: "cpu and decoder and not cb and not other_e2e and not quantized"
           flags: "--timeout=300"
-          hf_models: "JackFram/llama-160m"
+          hf_model: "JackFram/llama-160m"
         - name: "fp8"
           markers: "cpu and quantized and multi"
           flags: "--timeout=600 -k 'basic and test_output' --durations=0"
-          hf_models: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
+          hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
+          hf_model_rev: "main"
         - name: "embedding"
           markers: "cpu and embedding and not quantized"
           flags: "--timeout=300"
-          hf_models: "sentence-transformers/all-roberta-large-v1"
+          hf_model: "sentence-transformers/all-roberta-large-v1"
         - name: "scoring"
           markers: "cpu and scoring"
           flags: "--timeout=300"
-          hf_models: "cross-encoder/stsb-roberta-large"
+          hf_model: "cross-encoder/stsb-roberta-large"
         - name: "continuous batching"
           markers: "cpu and cb and not quantized"
           flags: "--timeout=300 --durations=0 -s"
@@ -80,7 +83,8 @@
           name: "backward compat"
           markers: "compat or (cpu and basic)"
           flags: "--timeout=300"
-          hf_models: "micro-g3.3_roberta-large"
+          hf_model_2: "sentence-transformers/all-roberta-large-v1"
+          hf_model_2_rev: "main"
           os: "ubuntu-latest"
           python_version: "3.12"
         # Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
@@ -150,19 +154,28 @@
          # overwritten.
          uv pip install -v .
-      # Standardize model name for cache keys
-      - name: Standardize HF model name
+      - name: "Standardize HF model names for caching"
         id: standardize-names
         run: |
-          model="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"
-          if [[ "$model" == "micro-g3.3_roberta-large" ]]; then
-            echo "model_key=micro-g3.3_roberta-large" >> "$GITHUB_ENV"
-            echo "model_path=${HF_HUB_CACHE}" >> "$GITHUB_ENV"
+          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
+            model="${{ matrix.test_suite.hf_model }}"
+            revision="${{ matrix.test_suite.hf_model_rev }}"
           else
-            # replace / with --
+            model="${{ env.DEFAULT_HF_MODEL }}"
+            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
+          fi
+          # replace '/' with '--'
           safe_name="${model//\//--}"
-          echo "model_key=$safe_name" >> "$GITHUB_ENV"
+          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
           echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"
+
+          if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
+            model_2="${{ matrix.test_suite.hf_model_2 }}"
+            revision_2="${{ matrix.test_suite.hf_model_2_rev}}"
+            # replace '/' with '--'
+            safe_name_2="${model_2//\//--}"
+            echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
+            echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
           fi
 
       - name: "Restore HF models cache"
@@ -173,54 +186,40 @@
         uses: actions/cache/restore@v4
         with:
           path: ${{ env.model_path }}
           key: ${{ runner.os }}-hf-model-${{ env.model_key }}
 
+      - name: "Restore HF models cache for additional model"
+        id: cache_restore_2
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
+        uses: actions/cache/restore@v4
+        with:
+          path: ${{ env.model_2_path }}
+          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
+
       - name: "Download HF models"
-        if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
         run: |
-          # We are caching HF models (HF_HUB_CACHE) for reliability rather than speed, since HF downloads are flaky for concurrent jobs.
-          # Be careful when adding models to the cache here, as the GHA cache is limited to 10 GB.
-          # If a new model is added here, a new hash key is generated. The previous cache blob can then
-          # be removed by an admin or can be left to expire after 7 days.
-
-          download_tinygranite() {
-            python -c "from transformers import pipeline, AutoTokenizer; pipeline('text-generation', model='$1'); tokenizer=AutoTokenizer.from_pretrained('$1')"
-          }
-          download_roberta_large() {
-            python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('$1')"
-          }
-          # tinyllama used for static batching tests because static batching is _too slow_
-          download_tinyllama() {
-            python -c "from transformers import pipeline; pipeline('text-generation', model='$1')"
-          }
-
-          hf_models="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"
-
-          if [[ "$hf_models" == "micro-g3.3_roberta-large" ]]; then
-            models=("ibm-ai-platform/micro-g3.3-8b-instruct-1b" "sentence-transformers/all-roberta-large-v1")
+          # We are caching HF models (HF_HUB_CACHE) for reliability rather than
+          # speed, since HF downloads are flaky for concurrent jobs.
+          # Be careful when adding models to the cache here, as the GHA cache is
+          # limited to 10 GB.
+          # If a new model is added here, a new hash key is generated. The
+          # previous cache blob can then be removed by an admin or can be left
+          # to expire after 7 days.
+
+          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
+            hf_model="${{ matrix.test_suite.hf_model }}"
+            hf_revision="${{ matrix.test_suite.hf_model_rev }}"
           else
-            models=("$hf_models")
+            hf_model="${{ env.DEFAULT_HF_MODEL }}"
+            hf_revision="${{ env.DEFAULT_HF_MODEL_REV }}"
           fi
+          hf_model_2="${{ matrix.test_suite.hf_model_2 }}"
+          hf_revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
 
-          for model in "${models[@]}"; do
-            echo "Downloading $model ..."
-            case "$model" in
-              "ibm-ai-platform/micro-g3.3-8b-instruct-1b"*)
-                download_tinygranite "$model" &
-                ;;
-              "JackFram/llama-160m")
-                download_tinyllama "$model" &
-                ;;
-              "sentence-transformers/all-roberta-large-v1")
-                download_roberta_large "$model" &
-                ;;
-              "cross-encoder/stsb-roberta-large")
-                download_roberta_large "$model" &
-                ;;
-              *)
-                echo "No download method found for: $model";
-                exit 1
-                ;;
-            esac
-          done
+          python3 tools/download_model.py -m "$hf_model" -r "${hf_revision:-main}" &
+
+          if [[ -n "$hf_model_2" ]]; then
+            python3 tools/download_model.py -m "$hf_model_2" -r "${hf_revision_2:-main}" &
+          fi
 
           wait
 
@@ -231,6 +230,13 @@
           path: ${{ env.model_path }}
           key: ${{ runner.os }}-hf-model-${{ env.model_key }}
 
+      - name: "Save HF models cache for additional model"
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
+        uses: actions/cache/save@v4
+        with:
+          path: ${{ env.model_2_path }}
+          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
+
       - name: "Run tests"
         if: steps.changed-src-files.outputs.any_changed == 'true'
         env:
diff --git a/tools/download_model.py b/tools/download_model.py
new file mode 100755
index 000000000..816c8dce7
--- /dev/null
+++ b/tools/download_model.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""Download a model from HuggingFace with a pinned revision.
+
+> python3 tools/download_model.py -m <model_id> [-r <revision>]
+
+"""
+
+import argparse
+import logging
+
+
+def download_granite_or_llama(model: str, revision: str = "main"):
+    from transformers import pipeline
+    pipeline('text-generation', model=model, revision=revision)
+
+
+def download_roberta(model: str, revision: str = "main"):
+    from sentence_transformers import SentenceTransformer
+    SentenceTransformer(model, revision=revision)
+
+
+download_methods = {
+    "ibm-ai-platform/micro-g3.3-8b-instruct-1b": download_granite_or_llama,
+    "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8": download_granite_or_llama,
+    "JackFram/llama-160m": download_granite_or_llama,
+    "cross-encoder/stsb-roberta-large": download_roberta,
+    "sentence-transformers/all-roberta-large-v1": download_roberta,
+}
+
+
+def download_model_with_revision(model: str, revision: str = "main"):
+    if model in download_methods:
+        download_method = download_methods.get(model)
+        logging.info("Downloading model '%s' with revision '%s' ...", model,
+                     revision)
+        download_method(model, revision)
+        logging.info("Model '%s' with revision '%s' downloaded.", model,
+                     revision)
+    else:
+        logging.error(
+            "No `download_method` found for model '%s'."
+            " Supported models: %s", model, str(list(download_methods.keys())))
+        exit(1)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", dest="model", help="HuggingFace model ID")
+    parser.add_argument("-r",
+                        dest="revision",
+                        default="main",
+                        help="Git hash, tag, or branch (default='main')")
+    args, _extra_args = parser.parse_known_args()
+
+    if args.model:
+        download_model_with_revision(args.model, args.revision)
+    else:
+        logging.error("Need to provide a HuggingFace model ID.")
+        exit(1)
+
+
+if __name__ == '__main__':
+    main()
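Note on the download helper introduced above: pipeline() and SentenceTransformer() warm HF_HUB_CACHE by fully instantiating each model, which is why the script needs a per-model download method. A model-agnostic alternative -- not part of this series, sketched here on the assumption that huggingface_hub (already a transformers dependency) is available -- would fetch the raw repository files at the pinned revision without loading any weights:

    # Sketch only: populate HF_HUB_CACHE for a pinned revision without
    # instantiating the model; works the same way for any repo on the Hub.
    from huggingface_hub import snapshot_download

    snapshot_download(repo_id="JackFram/llama-160m", revision="main")

A trade-off to note: the instantiation-based helpers in the patch fail fast when the pinned revision cannot actually be loaded by the intended model class, which a plain snapshot_download() would not catch.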
From d8e6a87d1adc308687e3201c49d6d1476b86a20a Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 12:48:18 -0700
Subject: [PATCH 2/7] don't specify revision for DEFAULT_HF_MODEL

Signed-off-by: Christian Kadner
---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f5a3c70f7..580ca110d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ env:
   VLLM_PLUGINS: "spyre"
   HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
   DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
-  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
+# DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
 # DEFAULT_HF_MODEL_REV: "2714578f54cfb744ece40df9326ee0b47e879e03"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

From 492a6278bfb2e31ad81d6c61bb5d9aa2e78e1066 Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 13:21:05 -0700
Subject: [PATCH 3/7] formatting and help comments

Signed-off-by: Christian Kadner
---
 .github/workflows/test.yml | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 580ca110d..44c94277e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -51,11 +51,11 @@ jobs:
           markers: "cpu and quantized and multi"
           flags: "--timeout=600 -k 'basic and test_output' --durations=0"
           hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
-          hf_model_rev: "main"
         - name: "embedding"
           markers: "cpu and embedding and not quantized"
           flags: "--timeout=300"
           hf_model: "sentence-transformers/all-roberta-large-v1"
+          hf_model_rev: "main"
         - name: "scoring"
           markers: "cpu and scoring"
           flags: "--timeout=300"
           hf_model: "cross-encoder/stsb-roberta-large"
@@ -157,6 +157,14 @@ jobs:
       - name: "Standardize HF model names for caching"
         id: standardize-names
         run: |
+          # replace '/' characters in HF model names with '--', both for GHA
+          # cache keys and for directory names in the local HF hub cache
+
+          # don't use in-line default values for variable expansion here, so
+          # that a non-default model is never paired with the default model's
+          # revision, as would happen with:
+          #   model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
+          #   revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"
+
           if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
             model="${{ matrix.test_suite.hf_model }}"
             revision="${{ matrix.test_suite.hf_model_rev }}"
@@ -164,17 +172,15 @@
             model="${{ env.DEFAULT_HF_MODEL }}"
             revision="${{ env.DEFAULT_HF_MODEL_REV }}"
           fi
-          # replace '/' with '--'
-          safe_name="${model//\//--}"
-          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
-          echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"
+          safe_name="${model//\//--}"
+          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
+          echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"
 
           if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
             model_2="${{ matrix.test_suite.hf_model_2 }}"
             revision_2="${{ matrix.test_suite.hf_model_2_rev}}"
-            # replace '/' with '--'
             safe_name_2="${model_2//\//--}"
-            echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
+            echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
             echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
           fi
@@ -206,19 +212,19 @@
           if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
-            hf_model="${{ matrix.test_suite.hf_model }}"
-            hf_revision="${{ matrix.test_suite.hf_model_rev }}"
+            model="${{ matrix.test_suite.hf_model }}"
+            revision="${{ matrix.test_suite.hf_model_rev }}"
           else
-            hf_model="${{ env.DEFAULT_HF_MODEL }}"
-            hf_revision="${{ env.DEFAULT_HF_MODEL_REV }}"
+            model="${{ env.DEFAULT_HF_MODEL }}"
+            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
           fi
-          hf_model_2="${{ matrix.test_suite.hf_model_2 }}"
-          hf_revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
+          model_2="${{ matrix.test_suite.hf_model_2 }}"
+          revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
 
-          python3 tools/download_model.py -m "$hf_model" -r "${hf_revision:-main}" &
-
-          if [[ -n "$hf_model_2" ]]; then
-            python3 tools/download_model.py -m "$hf_model_2" -r "${hf_revision_2:-main}" &
+          python3 tools/download_model.py -m "$model" -r "${revision:-main}" &
+
+          if [[ -n "$model_2" ]]; then
+            python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
           fi
 
           wait
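The comment added in PATCH 3 warns against GitHub Actions' in-line default expansion. A small illustration -- hypothetical Python, not part of the series -- of why the fallback must happen for model and revision as a pair:

    # resolve() is a made-up name; it mirrors the workflow's bash if/else.
    def resolve(model, revision, default_model, default_revision):
        # Per-field fallback: an entry that sets hf_model but leaves
        # hf_model_rev empty would inherit DEFAULT_HF_MODEL_REV, a commit
        # hash that belongs to a different repository.
        mixed = (model or default_model, revision or default_revision)
        # Paired fallback, as the workflow does it: either both matrix
        # values or both defaults.
        paired = (model, revision) if model else (default_model,
                                                  default_revision)
        return mixed, paired

With model="JackFram/llama-160m" and revision="", the per-field form pairs llama-160m with the micro-granite revision hash, while the paired form leaves the revision empty so the download script falls back to "main".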
From ca5ba0969ba9081cd79b0de7ddb04a406026961b Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 15:54:52 -0700
Subject: [PATCH 4/7] cache same model revisions as specified in unit tests

Signed-off-by: Christian Kadner
---
 .github/workflows/test.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 154f3056c..0a91cd2ce 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,8 +21,7 @@ env:
   VLLM_PLUGINS: "spyre"
   HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
   DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
-# DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
-# DEFAULT_HF_MODEL_REV: "2714578f54cfb744ece40df9326ee0b47e879e03"
+  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -50,15 +50,17 @@ jobs:
           markers: "cpu and quantized and multi"
           flags: "--timeout=600 -k 'basic and test_output' --durations=0"
           hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
+          hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
         - name: "embedding"
           markers: "cpu and embedding and not quantized"
           flags: "--timeout=300"
           hf_model: "sentence-transformers/all-roberta-large-v1"
-          hf_model_rev: "main"
+          hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
         - name: "scoring"
           markers: "cpu and scoring"
           flags: "--timeout=300"
           hf_model: "cross-encoder/stsb-roberta-large"
+          hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
         - name: "continuous batching"
           markers: "cpu and cb and not quantized"
           flags: "--timeout=300 --durations=0 -s"
@@ -84,7 +85,7 @@
           markers: "compat or (cpu and basic)"
           flags: "--timeout=300"
           hf_model_2: "sentence-transformers/all-roberta-large-v1"
-          hf_model_2_rev: "main"
+          hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
           os: "ubuntu-latest"
           python_version: "3.12"
         # Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
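PATCH 4 pins concrete commit hashes in the matrix, which also rotates every GHA cache key. For reference, a Python rendering of the key construction from the "Standardize HF model names for caching" step (the function name is illustrative; the real logic is the bash above):

    def cache_key(runner_os: str, model: str, revision: str) -> str:
        # '/' cannot appear in cache keys or HF hub directory names, so it
        # is escaped as '--', matching the hub's models--org--name layout.
        safe_name = model.replace("/", "--")
        return f"{runner_os}-hf-model-{safe_name}_{revision}"

    # e.g. the embedding suite on ubuntu-latest (runner.os is "Linux"):
    print(cache_key("Linux", "sentence-transformers/all-roberta-large-v1",
                    "cf74d8acd4f198de950bf004b262e6accfed5d2c"))

Because the revision is part of the key, bumping a pinned hash starts a fresh cache entry, and the stale blob simply expires, as the workflow comment explains.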
From b0aa69cc2a949b712b400030e8126986d3b6307e Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 17:23:02 -0700
Subject: [PATCH 5/7] set model revision in test prompt logprobs

Signed-off-by: Christian Kadner
---
 tests/e2e/test_spyre_prompt_logprobs.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/test_spyre_prompt_logprobs.py b/tests/e2e/test_spyre_prompt_logprobs.py
index 260e1831a..e90e48e2b 100644
--- a/tests/e2e/test_spyre_prompt_logprobs.py
+++ b/tests/e2e/test_spyre_prompt_logprobs.py
@@ -78,7 +78,9 @@ def test_prompt_logprobs_not_supported_with_cb(
     monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
 
     with pytest.raises(ValueError, match="continuous batching"):
-        VllmConfig(model_config=ModelConfig(model=model.name, task="generate"))
+        VllmConfig(model_config=ModelConfig(model=model.name,
+                                            revision=model.revision,
+                                            task="generate"))
 
 
 @pytest.mark.skip
@@ -137,7 +139,8 @@ def _get_hf_prompt_logprobs(model_info: ModelInfo, prompts,
     for each token"""
     tokenizer = AutoTokenizer.from_pretrained(model_info.name,
                                               revision=model_info.revision)
-    model = AutoModelForCausalLM.from_pretrained(model_info.name)
+    model = AutoModelForCausalLM.from_pretrained(model_info.name,
+                                                 revision=model_info.revision)
 
     prompt_logprobs = {}
     for prompt in prompts:

From 7c9d12f2058f30b4396b1da42dc9963a4ad2e974 Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 17:24:03 -0700
Subject: [PATCH 6/7] YAPF!

Signed-off-by: Christian Kadner
---
 tests/e2e/test_spyre_prompt_logprobs.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/test_spyre_prompt_logprobs.py b/tests/e2e/test_spyre_prompt_logprobs.py
index e90e48e2b..22bc1e2bf 100644
--- a/tests/e2e/test_spyre_prompt_logprobs.py
+++ b/tests/e2e/test_spyre_prompt_logprobs.py
@@ -78,9 +78,8 @@ def test_prompt_logprobs_not_supported_with_cb(
     monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
 
     with pytest.raises(ValueError, match="continuous batching"):
-        VllmConfig(model_config=ModelConfig(model=model.name,
-                                            revision=model.revision,
-                                            task="generate"))
+        VllmConfig(model_config=ModelConfig(
+            model=model.name, revision=model.revision, task="generate"))
 
 
 @pytest.mark.skip

From 89c4f46fe2927beeffb65de80c965f8a719331a5 Mon Sep 17 00:00:00 2001
From: Prashant Gupta
Date: Tue, 14 Oct 2025 16:54:26 -0700
Subject: [PATCH 7/7] get_tokenizer with revision for GTI

Signed-off-by: Prashant Gupta
Signed-off-by: Christian Kadner
---
 tests/golden_token_injector.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/golden_token_injector.py b/tests/golden_token_injector.py
index 126246eff..a85b1eef7 100644
--- a/tests/golden_token_injector.py
+++ b/tests/golden_token_injector.py
@@ -49,7 +49,9 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device,
         # for couple requests that does not have too much impact.
         # But since this is used mostly for validation, it would be fine
         # to keep them.
-        self.tokenizer = get_tokenizer(vllm_config.model_config.tokenizer)
+        self.tokenizer = get_tokenizer(
+            vllm_config.model_config.tokenizer,
+            revision=vllm_config.model_config.revision)
 
     def is_argmax_invariant(self) -> bool:
         """Never impacts greedy sampling"""
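Taken together, PATCHes 5-7 thread the pinned revision through every HuggingFace entry point the tests touch. The common shape, sketched with a hypothetical stand-in for the test suite's ModelInfo (only its name and revision fields are evident from these diffs):

    from dataclasses import dataclass

    from transformers import AutoModelForCausalLM, AutoTokenizer

    @dataclass
    class PinnedModel:
        """Illustrative stand-in for the test suite's ModelInfo."""
        name: str
        revision: str

    def load_pinned(info: PinnedModel):
        # Tokenizer and weights resolve against the same commit, so the
        # two artifacts cannot drift apart between CI runs.
        tok = AutoTokenizer.from_pretrained(info.name, revision=info.revision)
        mdl = AutoModelForCausalLM.from_pretrained(info.name,
                                                   revision=info.revision)
        return tok, mdl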