129 changes: 71 additions & 58 deletions .github/workflows/test.yml
@@ -21,6 +21,7 @@ env:
VLLM_PLUGINS: "spyre"
HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -44,19 +45,22 @@ jobs:
- name: "static batching"
markers: "cpu and decoder and not cb and not other_e2e and not quantized"
flags: "--timeout=300"
hf_models: "JackFram/llama-160m"
hf_model: "JackFram/llama-160m"
- name: "fp8"
markers: "cpu and quantized and multi"
flags: "--timeout=600 -k 'basic and test_output' --durations=0"
hf_models: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
- name: "embedding"
markers: "cpu and embedding and not quantized"
flags: "--timeout=300"
hf_models: "sentence-transformers/all-roberta-large-v1"
hf_model: "sentence-transformers/all-roberta-large-v1"
hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
- name: "scoring"
markers: "cpu and scoring"
flags: "--timeout=300"
hf_models: "cross-encoder/stsb-roberta-large"
hf_model: "cross-encoder/stsb-roberta-large"
hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
- name: "continuous batching"
markers: "cpu and cb and not quantized"
flags: "--timeout=300 --durations=0 -s"
Expand All @@ -80,7 +84,8 @@ jobs:
name: "backward compat"
markers: "compat or (cpu and basic)"
flags: "--timeout=300"
hf_models: "micro-g3.3_roberta-large"
hf_model_2: "sentence-transformers/all-roberta-large-v1"
hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
os: "ubuntu-latest"
python_version: "3.12"
# Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
@@ -150,19 +155,34 @@ jobs:
# overwritten.
uv pip install -v .

# Standardize model name for cache keys
- name: Standardize HF model name
- name: "Standardize HF model names for caching"
id: standardize-names
run: |
model="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"
if [[ "$model" == "micro-g3.3_roberta-large" ]]; then
echo "model_key=micro-g3.3_roberta-large" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}" >> "$GITHUB_ENV"
# Replace '/' characters in the HF model name with '--' for GHA cache
# keys and for the model directory names in the local HF hub cache.

# Don't use in-line default values for variable expansion here; that
# could pair a non-default model with the default model revision, as in:
# model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
# revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"

if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
model="${{ matrix.test_suite.hf_model }}"
revision="${{ matrix.test_suite.hf_model_rev }}"
else
# replace / with --
safe_name="${model//\//--}"
echo "model_key=$safe_name" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"
model="${{ env.DEFAULT_HF_MODEL }}"
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
safe_name="${model//\//--}"
echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"

if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
model_2="${{ matrix.test_suite.hf_model_2 }}"
revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
safe_name_2="${model_2//\//--}"
echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
fi
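
For illustration, a minimal sketch (not part of the workflow) of the key/path derivation this step performs, using the default model and revision from the `env` block above; the echoed values are what the bash expansions yield:

```bash
# Hypothetical walk-through of the key/path derivation above:
model="ibm-ai-platform/micro-g3.3-8b-instruct-1b"
revision="6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
safe_name="${model//\//--}"   # replace every '/' with '--'
echo "model_key=${safe_name}_${revision}"
# model_key=ibm-ai-platform--micro-g3.3-8b-instruct-1b_6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f
echo "model_path=${HF_HUB_CACHE}/models--${safe_name}"
# model_path=<HF_HUB_CACHE>/models--ibm-ai-platform--micro-g3.3-8b-instruct-1b
```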

- name: "Restore HF models cache"
@@ -173,54 +193,40 @@ jobs:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}

- name: "Restore HF models cache for additional model"
id: cache_restore_2
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
uses: actions/cache/restore@v4
with:
path: ${{ env.model_2_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

- name: "Download HF models"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
run: |
# We are caching HF models (HF_HUB_CACHE) for reliability rather than speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is limited to 10 GB.
# If a new model is added here, a new hash key is generated. The previous cache blob can then
# be removed by an admin or can be left to expire after 7 days.

download_tinygranite() {
python -c "from transformers import pipeline, AutoTokenizer; pipeline('text-generation', model='$1'); tokenizer=AutoTokenizer.from_pretrained('$1')"
}
download_roberta_large() {
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('$1')"
}
# tinyllama used for static batching tests because static batching is _too slow_
download_tinyllama() {
python -c "from transformers import pipeline; pipeline('text-generation', model='$1')"
}

hf_models="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"

if [[ "$hf_models" == "micro-g3.3_roberta-large" ]]; then
models=("ibm-ai-platform/micro-g3.3-8b-instruct-1b" "sentence-transformers/all-roberta-large-v1")
# We are caching HF models (HF_HUB_CACHE) for reliability rather than
# speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is
# limited to 10 GB.
# If a new model is added here, a new hash key is generated. The
# previous cache blob can then be removed by an admin or can be left
# to expire after 7 days.

if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
model="${{ matrix.test_suite.hf_model }}"
revision="${{ matrix.test_suite.hf_model_rev }}"
else
models=("$hf_models")
model="${{ env.DEFAULT_HF_MODEL }}"
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
model_2="${{ matrix.test_suite.hf_model_2 }}"
revision_2="${{ matrix.test_suite.hf_model_2_rev }}"

for model in "${models[@]}"; do
echo "Downloading $model ..."
case "$model" in
"ibm-ai-platform/micro-g3.3-8b-instruct-1b"*)
download_tinygranite "$model" &
;;
"JackFram/llama-160m")
download_tinyllama "$model" &
;;
"sentence-transformers/all-roberta-large-v1")
download_roberta_large "$model" &
;;
"cross-encoder/stsb-roberta-large")
download_roberta_large "$model" &
;;
*)
echo "No download method found for: $model";
exit 1
;;
esac
done
python3 tools/download_model.py -m "$model" -r "${revision:-main}" &

if [[ -n "$model_2" ]]; then
python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
fi

wait
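
The save/restore steps key on the standard HF hub cache layout; a hedged sanity-check sketch (not part of the workflow) of what the cache should contain once the downloads above finish:

```bash
# Hypothetical inspection of the hub cache after `wait` returns:
ls "${HF_HUB_CACHE}"
# models--ibm-ai-platform--micro-g3.3-8b-instruct-1b
ls "${HF_HUB_CACHE}/models--ibm-ai-platform--micro-g3.3-8b-instruct-1b"
# blobs  refs  snapshots    (snapshots/<revision>/ holds the pinned files)
```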

@@ -231,6 +237,13 @@ jobs:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}

- name: "Save HF models cache for additional model"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
uses: actions/cache/save@v4
with:
path: ${{ env.model_2_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

- name: "Run tests"
if: steps.changed-src-files.outputs.any_changed == 'true'
env:
6 changes: 4 additions & 2 deletions tests/e2e/test_spyre_prompt_logprobs.py
@@ -78,7 +78,8 @@ def test_prompt_logprobs_not_supported_with_cb(
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")

with pytest.raises(ValueError, match="continuous batching"):
VllmConfig(model_config=ModelConfig(model=model.name, task="generate"))
VllmConfig(model_config=ModelConfig(
model=model.name, revision=model.revision, task="generate"))


@pytest.mark.skip
@@ -137,7 +138,8 @@ def _get_hf_prompt_logprobs(model_info: ModelInfo, prompts,
for each token"""
tokenizer = AutoTokenizer.from_pretrained(model_info.name,
revision=model_info.revision)
model = AutoModelForCausalLM.from_pretrained(model_info.name)
model = AutoModelForCausalLM.from_pretrained(model_info.name,
revision=model_info.revision)

prompt_logprobs = {}
for prompt in prompts:
4 changes: 3 additions & 1 deletion tests/golden_token_injector.py
@@ -49,7 +49,9 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device,
# for a couple of requests, which does not have much impact.
# But since this is used mostly for validation, it would be fine
# to keep them.
self.tokenizer = get_tokenizer(vllm_config.model_config.tokenizer)
self.tokenizer = get_tokenizer(
vllm_config.model_config.tokenizer,
revision=vllm_config.model_config.revision)

def is_argmax_invariant(self) -> bool:
"""Never impacts greedy sampling"""
63 changes: 63 additions & 0 deletions tools/download_model.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""Download a model from HuggingFace with revision.

> python3 tools/download_model.py -m <HF-model-id> [-r <git-tag-or-hash>]

"""

import argparse
import logging
import sys


def download_granite_or_llama(model: str, revision: str = "main"):
from transformers import pipeline
pipeline('text-generation', model=model, revision=revision)


def download_roberta(model: str, revision: str = "main"):
from sentence_transformers import SentenceTransformer
SentenceTransformer(model, revision=revision)


download_methods = {
"ibm-ai-platform/micro-g3.3-8b-instruct-1b": download_granite_or_llama,
"ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8": download_granite_or_llama,
"JackFram/llama-160m": download_granite_or_llama,
"cross-encoder/stsb-roberta-large": download_roberta,
"sentence-transformers/all-roberta-large-v1": download_roberta,
}


def download_model_with_revision(model: str, revision: str = "main"):
if model in download_methods:
download_method = download_methods.get(model)
logging.info("Downloading model '%s' with revision '%s' ...", model,
revision)
download_method(model, revision)
logging.info("Model '%s' with revision '%s' downloaded.", model,
revision)
else:
logging.error(
"No `download_method` found for model '%s'."
" Supported models: %s", model, str(list(download_methods.keys())))
exit(1)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", dest="model", help="HuggingFace model ID")
    parser.add_argument("-r",
                        dest="revision",
                        default="main",
                        help="Git hash, tag, or branch (default='main')")
    args, _extra_args = parser.parse_known_args()

    # logging defaults to WARNING; raise the level so the info logs show up
    logging.basicConfig(level=logging.INFO)

    if args.model:
        download_model_with_revision(args.model, args.revision)
    else:
        logging.error("Need to provide a HuggingFace model ID.")
        sys.exit(1)


if __name__ == '__main__':
main()
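
A quick usage sketch, with the model ID and revision taken from the workflow above; a model without a registered `download_method` exits non-zero after logging the supported list:

```bash
python3 tools/download_model.py \
    -m ibm-ai-platform/micro-g3.3-8b-instruct-1b \
    -r 6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f
```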