148 changes: 77 additions & 71 deletions .github/workflows/test.yml
@@ -21,6 +21,8 @@ env:
VLLM_PLUGINS: "spyre"
HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
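# pin the default model to an exact commit so repeated runs and cache keys stay reproducible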
DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
# DEFAULT_HF_MODEL_REV: "2714578f54cfb744ece40df9326ee0b47e879e03"

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -44,19 +46,20 @@ jobs:
- name: "static batching"
markers: "cpu and decoder and not cb and not other_e2e"
flags: "--timeout=300"
hf_models: "JackFram/llama-160m"
hf_model: "JackFram/llama-160m"
- name: "fp8"
markers: "cpu and quantized and multi"
flags: "--timeout=600 -k 'basic and test_output' --durations=0"
hf_models: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
hf_model_rev: "main"
- name: "embedding"
markers: "cpu and embedding"
flags: "--timeout=300"
hf_models: "sentence-transformers/all-roberta-large-v1"
hf_model: "sentence-transformers/all-roberta-large-v1"
- name: "scoring"
markers: "cpu and scoring"
flags: "--timeout=300"
hf_models: "cross-encoder/stsb-roberta-large"
hf_model: "cross-encoder/stsb-roberta-large"
- name: "continuous batching"
markers: "cpu and cb"
flags: "--timeout=300 --durations=0 -s"
@@ -80,7 +83,8 @@ jobs:
name: "backward compat"
markers: "compat or (cpu and basic)"
flags: "--timeout=300"
hf_models: "micro-g3.3_roberta-large"
hf_model_2: "sentence-transformers/all-roberta-large-v1"
hf_model_2_rev: "main"
os: "ubuntu-latest"
python_version: "3.12"
# Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
@@ -150,86 +154,88 @@ jobs:
# overwritten.
uv pip install -v .

# Standardize model name for cache keys
- name: Standardize HF model name
- name: "Standardize HF model names for caching"
id: standardize-names
run: |
model="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"
if [[ "$model" == "micro-g3.3_roberta-large" ]]; then
echo "model_key=micro-g3.3_roberta-large" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}" >> "$GITHUB_ENV"
if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
model="${{ matrix.test_suite.hf_model }}"
revision="${{ matrix.test_suite.hf_model_rev }}"
else
# replace / with --
safe_name="${model//\//--}"
echo "model_key=$safe_name" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"
model="${{ env.DEFAULT_HF_MODEL }}"
revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
# replace '/' with '--'
safe_name="${model//\//--}"
echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"

if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
model_2="${{ matrix.test_suite.hf_model_2 }}"
revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
# replace '/' with '--'
safe_name_2="${model_2//\//--}"
echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
fi

- name: "Restore HF models cache"
id: cache_restore
if: steps.changed-src-files.outputs.any_changed == 'true'
uses: actions/cache/restore@v4
with:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}
# - name: "Restore HF models cache"
# id: cache_restore
# if: steps.changed-src-files.outputs.any_changed == 'true'
# uses: actions/cache/restore@v4
# with:
# path: ${{ env.model_path }}
# key: ${{ runner.os }}-hf-model-${{ env.model_key }}
#
# - name: "Restore HF models cache for additional model"
# id: cache_restore_2
# if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
# uses: actions/cache/restore@v4
# with:
# path: ${{ env.model_2_path }}
# key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

- name: "Download HF models"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
# if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
run: |
# We are caching HF models (HF_HUB_CACHE) for reliability rather than speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is limited to 10 GB.
# If a new model is added here, a new hash key is generated. The previous cache blob can then
# be removed by an admin or can be left to expire after 7 days.

download_tinygranite() {
python -c "from transformers import pipeline, AutoTokenizer; pipeline('text-generation', model='$1'); tokenizer=AutoTokenizer.from_pretrained('$1')"
}
download_roberta_large() {
python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('$1')"
}
# tinyllama used for static batching tests because static batching is _too slow_
download_tinyllama() {
python -c "from transformers import pipeline; pipeline('text-generation', model='$1')"
}

hf_models="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"

if [[ "$hf_models" == "micro-g3.3_roberta-large" ]]; then
models=("ibm-ai-platform/micro-g3.3-8b-instruct-1b" "sentence-transformers/all-roberta-large-v1")
# We are caching HF models (HF_HUB_CACHE) for reliability rather than
# speed, since HF downloads are flaky for concurrent jobs.
# Be careful when adding models to the cache here, as the GHA cache is
# limited to 10 GB.
# If a new model is added here, a new hash key is generated. The
# previous cache blob can then be removed by an admin or can be left
# to expire after 7 days.

if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
hf_model="${{ matrix.test_suite.hf_model }}"
hf_revision="${{ matrix.test_suite.hf_model_rev }}"
else
models=("$hf_models")
hf_model="${{ env.DEFAULT_HF_MODEL }}"
hf_revision="${{ env.DEFAULT_HF_MODEL_REV }}"
fi
hf_model_2="${{ matrix.test_suite.hf_model_2 }}"
hf_revision_2="${{ matrix.test_suite.hf_model_2_rev }}"

for model in "${models[@]}"; do
echo "Downloading $model ..."
case "$model" in
"ibm-ai-platform/micro-g3.3-8b-instruct-1b"*)
download_tinygranite "$model" &
;;
"JackFram/llama-160m")
download_tinyllama "$model" &
;;
"sentence-transformers/all-roberta-large-v1")
download_roberta_large "$model" &
;;
"cross-encoder/stsb-roberta-large")
download_roberta_large "$model" &
;;
*)
echo "No download method found for: $model";
exit 1
;;
esac
done
python3 tools/download_model.py -m "$hf_model" -r "${hf_revision:-main}" &

if [[ -n "$hf_model_2" ]]; then
python3 tools/download_model.py -m "$hf_model_2" -r "${hf_revision_2:-main}" &
fi

# a bare `wait` always returns 0, which would mask a failed download;
# waiting on each PID propagates any nonzero exit and fails the step
for pid in $(jobs -p); do wait "$pid"; done

- name: "Save HF models cache"
if: ( steps.changed-src-files.outputs.any_changed == 'true' && github.event_name != 'pull_request' && steps.cache_restore.outputs.cache-hit != 'true' )
uses: actions/cache/save@v4
with:
path: ${{ env.model_path }}
key: ${{ runner.os }}-hf-model-${{ env.model_key }}
# - name: "Save HF models cache"
# if: ( steps.changed-src-files.outputs.any_changed == 'true' && github.event_name != 'pull_request' && steps.cache_restore.outputs.cache-hit != 'true' )
# uses: actions/cache/save@v4
# with:
# path: ${{ env.model_path }}
# key: ${{ runner.os }}-hf-model-${{ env.model_key }}
#
# - name: "Save HF models cache for additional model"
# if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
# uses: actions/cache/save@v4
# with:
# path: ${{ env.model_2_path }}
# key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

- name: "Run tests"
if: steps.changed-src-files.outputs.any_changed == 'true'
67 changes: 67 additions & 0 deletions tools/download_model.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
"""Download a model from HuggingFace with revision.

> python3 tools/download_model.py -m <HF-model-id> [-r <git-tag-or-hash>]
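
Example:
> python3 tools/download_model.py -m JackFram/llama-160m -r main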

"""

import argparse
import logging
import sys


def download_granite_or_llama(hf_model_id: str, revision: str = "main"):
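    # building the pipeline downloads the model weights and tokenizer for the
    # pinned revision into HF_HUB_CACHE as a side effect; the pipeline object
    # itself is discarded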
from transformers import pipeline
pipeline('text-generation', model=hf_model_id, revision=revision)


def download_roberta(hf_model_id: str, revision: str = "main"):
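    # constructing the SentenceTransformer pulls the encoder weights for the
    # pinned revision into the shared HF cache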
from sentence_transformers import SentenceTransformer
SentenceTransformer(hf_model_id, revision=revision)


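# map each supported HF model ID to the loader that materializes it in the
# local HF hub cache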
download_methods = {
"ibm-ai-platform/micro-g3.3-8b-instruct-1b": download_granite_or_llama,
"ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8": download_granite_or_llama,
"JackFram/llama-160m": download_granite_or_llama,
"cross-encoder/stsb-roberta-large": download_roberta,
"sentence-transformers/all-roberta-large-v1": download_roberta,
}


def download_model_with_revision(hf_model_id: str, revision: str = "main"):
if hf_model_id in download_methods:
download_method = download_methods.get(hf_model_id)
logging.info("Downloading model '%s' with revision '%s' ...",
hf_model_id, revision)
download_method(hf_model_id, revision)
logging.info("Model '%s' with revision '%s' downloaded.", hf_model_id,
revision)
else:
logging.error(
"No `download_method` found for model '%s'."
" Supported models: %s", hf_model_id,
str(list(download_methods.keys())))
        sys.exit(1)


def main():
    # the root logger defaults to WARNING; raise to INFO so the download
    # progress messages above actually show up in CI logs
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
parser.add_argument('-m',
dest='hf_model_id',
help='HuggingFace model ID.')
parser.add_argument('-r',
dest='revision',
default="main",
help='Git tag, hash, or branch.')

args, _extra_args = parser.parse_known_args()

if args.hf_model_id:
download_model_with_revision(args.hf_model_id, args.revision)
else:
logging.error("Need to specify a model ID with -model.")
exit(1)


if __name__ == '__main__':
main()