From e8794ed3e516ab31e3dffe6dc53ab955e3b300e9 Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 12:40:23 -0700
Subject: [PATCH 1/7] GHA test with model revisions

Signed-off-by: Christian Kadner
---
 .github/workflows/test.yml | 118 +++++++++++++++++++------------------
 tools/download_model.py    |  63 ++++++++++++++++++++
 2 files changed, 125 insertions(+), 56 deletions(-)
 create mode 100755 tools/download_model.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 8373376b3..f5a3c70f7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,6 +21,8 @@ env:
   VLLM_PLUGINS: "spyre"
   HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
   DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
+  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
+# DEFAULT_HF_MODEL_REV: "2714578f54cfb744ece40df9326ee0b47e879e03"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -44,19 +46,20 @@ jobs:
         - name: "static batching"
           markers: "cpu and decoder and not cb and not other_e2e and not quantized"
           flags: "--timeout=300"
-          hf_models: "JackFram/llama-160m"
+          hf_model: "JackFram/llama-160m"
         - name: "fp8"
           markers: "cpu and quantized and multi"
           flags: "--timeout=600 -k 'basic and test_output' --durations=0"
-          hf_models: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
+          hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
+          hf_model_rev: "main"
         - name: "embedding"
           markers: "cpu and embedding and not quantized"
           flags: "--timeout=300"
-          hf_models: "sentence-transformers/all-roberta-large-v1"
+          hf_model: "sentence-transformers/all-roberta-large-v1"
         - name: "scoring"
           markers: "cpu and scoring"
           flags: "--timeout=300"
-          hf_models: "cross-encoder/stsb-roberta-large"
+          hf_model: "cross-encoder/stsb-roberta-large"
         - name: "continuous batching"
           markers: "cpu and cb and not quantized"
           flags: "--timeout=300 --durations=0 -s"
@@ -80,7 +83,8 @@
           name: "backward compat"
           markers: "compat or (cpu and basic)"
           flags: "--timeout=300"
-          hf_models: "micro-g3.3_roberta-large"
+          hf_model_2: "sentence-transformers/all-roberta-large-v1"
+          hf_model_2_rev: "main"
           os: "ubuntu-latest"
           python_version: "3.12"
         # Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
@@ -150,19 +154,28 @@
          # overwritten.
          uv pip install -v .
-      # Standardize model name for cache keys
-      - name: Standardize HF model name
+      - name: "Standardize HF model names for caching"
         id: standardize-names
         run: |
-          model="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"
-          if [[ "$model" == "micro-g3.3_roberta-large" ]]; then
-            echo "model_key=micro-g3.3_roberta-large" >> "$GITHUB_ENV"
-            echo "model_path=${HF_HUB_CACHE}" >> "$GITHUB_ENV"
+          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
+            model="${{ matrix.test_suite.hf_model }}"
+            revision="${{ matrix.test_suite.hf_model_rev }}"
           else
-            # replace / with --
+            model="${{ env.DEFAULT_HF_MODEL }}"
+            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
+          fi
+          # replace '/' with '--'
           safe_name="${model//\//--}"
-          echo "model_key=$safe_name" >> "$GITHUB_ENV"
+          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
           echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"
+
+          if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
+            model_2="${{ matrix.test_suite.hf_model_2 }}"
+            revision_2="${{ matrix.test_suite.hf_model_2_rev}}"
+            # replace '/' with '--'
+            safe_name_2="${model_2//\//--}"
+            echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
+            echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
           fi
 
       - name: "Restore HF models cache"
@@ -173,54 +186,40 @@
         uses: actions/cache/restore@v4
         with:
           path: ${{ env.model_path }}
           key: ${{ runner.os }}-hf-model-${{ env.model_key }}
 
+      - name: "Restore HF models cache for additional model"
+        id: cache_restore_2
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
+        uses: actions/cache/restore@v4
+        with:
+          path: ${{ env.model_2_path }}
+          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
+
       - name: "Download HF models"
-        if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
         run: |
-          # We are caching HF models (HF_HUB_CACHE) for reliability rather than speed, since HF downloads are flaky for concurrent jobs.
-          # Be careful when adding models to the cache here, as the GHA cache is limited to 10 GB.
-          # If a new model is added here, a new hash key is generated. The previous cache blob can then
-          # be removed by an admin or can be left to expire after 7 days.
-
-          download_tinygranite() {
-            python -c "from transformers import pipeline, AutoTokenizer; pipeline('text-generation', model='$1'); tokenizer=AutoTokenizer.from_pretrained('$1')"
-          }
-          download_roberta_large() {
-            python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('$1')"
-          }
-          # tinyllama used for static batching tests because static batching is _too slow_
-          download_tinyllama() {
-            python -c "from transformers import pipeline; pipeline('text-generation', model='$1')"
-          }
-
-          hf_models="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"
-
-          if [[ "$hf_models" == "micro-g3.3_roberta-large" ]]; then
-            models=("ibm-ai-platform/micro-g3.3-8b-instruct-1b" "sentence-transformers/all-roberta-large-v1")
+          # We are caching HF models (HF_HUB_CACHE) for reliability rather than
+          # speed, since HF downloads are flaky for concurrent jobs.
+          # Be careful when adding models to the cache here, as the GHA cache is
+          # limited to 10 GB.
+          # If a new model is added here, a new hash key is generated. The
+          # previous cache blob can then be removed by an admin or can be left
+          # to expire after 7 days.
+
+          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
+            hf_model="${{ matrix.test_suite.hf_model }}"
+            hf_revision="${{ matrix.test_suite.hf_model_rev }}"
           else
-            models=("$hf_models")
+            hf_model="${{ env.DEFAULT_HF_MODEL }}"
+            hf_revision="${{ env.DEFAULT_HF_MODEL_REV }}"
           fi
+          hf_model_2="${{ matrix.test_suite.hf_model_2 }}"
+          hf_revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
 
-          for model in "${models[@]}"; do
-            echo "Downloading $model ..."
-            case "$model" in
-              "ibm-ai-platform/micro-g3.3-8b-instruct-1b"*)
-                download_tinygranite "$model" &
-                ;;
-              "JackFram/llama-160m")
-                download_tinyllama "$model" &
-                ;;
-              "sentence-transformers/all-roberta-large-v1")
-                download_roberta_large "$model" &
-                ;;
-              "cross-encoder/stsb-roberta-large")
-                download_roberta_large "$model" &
-                ;;
-              *)
-                echo "No download method found for: $model";
-                exit 1
-                ;;
-            esac
-          done
+          python3 tools/download_model.py -m "$hf_model" -r "${hf_revision:-main}" &
+
+          if [[ -n "$hf_model_2" ]]; then
+            python3 tools/download_model.py -m "$hf_model_2" -r "${hf_revision_2:-main}" &
+          fi
 
           wait
 
@@ -231,6 +230,13 @@
           path: ${{ env.model_path }}
           key: ${{ runner.os }}-hf-model-${{ env.model_key }}
 
+      - name: "Save HF models cache for additional model"
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
+        uses: actions/cache/save@v4
+        with:
+          path: ${{ env.model_2_path }}
+          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
+
       - name: "Run tests"
         if: steps.changed-src-files.outputs.any_changed == 'true'
         env:
diff --git a/tools/download_model.py b/tools/download_model.py
new file mode 100755
index 000000000..816c8dce7
--- /dev/null
+++ b/tools/download_model.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""Download a model from HuggingFace with a pinned revision.
+
+> python3 tools/download_model.py -m <model_id> [-r <revision>]
+
+"""
+
+import argparse
+import logging
+
+
+def download_granite_or_llama(model: str, revision: str = "main"):
+    from transformers import pipeline
+    pipeline('text-generation', model=model, revision=revision)
+
+
+def download_roberta(model: str, revision: str = "main"):
+    from sentence_transformers import SentenceTransformer
+    SentenceTransformer(model, revision=revision)
+
+
+download_methods = {
+    "ibm-ai-platform/micro-g3.3-8b-instruct-1b": download_granite_or_llama,
+    "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8": download_granite_or_llama,
+    "JackFram/llama-160m": download_granite_or_llama,
+    "cross-encoder/stsb-roberta-large": download_roberta,
+    "sentence-transformers/all-roberta-large-v1": download_roberta,
+}
+
+
+def download_model_with_revision(model: str, revision: str = "main"):
+    if model in download_methods:
+        download_method = download_methods.get(model)
+        logging.info("Downloading model '%s' with revision '%s' ...", model,
+                     revision)
+        download_method(model, revision)
+        logging.info("Model '%s' with revision '%s' downloaded.", model,
+                     revision)
+    else:
+        logging.error(
+            "No `download_method` found for model '%s'."
+            " Supported models: %s", model, str(list(download_methods.keys())))
+        exit(1)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", dest="model", help="HuggingFace model ID")
+    parser.add_argument("-r",
+                        dest="revision",
+                        default="main",
+                        help="Git hash, tag, or branch (default='main')")
+    args, _extra_args = parser.parse_known_args()
+
+    if args.model:
+        download_model_with_revision(args.model, args.revision)
+    else:
+        logging.error("Need to provide a HuggingFace model ID.")
+        exit(1)
+
+
+if __name__ == '__main__':
+    main()
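Note on the download helper introduced above: pipeline() and SentenceTransformer() warm HF_HUB_CACHE by fully instantiating each model, which is why the script needs a per-model download method. A model-agnostic alternative -- not part of this series, sketched here on the assumption that huggingface_hub (already a transformers dependency) is available -- would fetch the raw repository files at the pinned revision without loading any weights:

    # Sketch only: populate HF_HUB_CACHE for a pinned revision without
    # instantiating the model; works the same way for any repo on the Hub.
    from huggingface_hub import snapshot_download

    snapshot_download(repo_id="JackFram/llama-160m", revision="main")

A trade-off to note: the instantiation-based helpers in the patch fail fast when the pinned revision cannot actually be loaded by the intended model class, which a plain snapshot_download() would not catch.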
From d8e6a87d1adc308687e3201c49d6d1476b86a20a Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 12:48:18 -0700
Subject: [PATCH 2/7] don't specify revision for DEFAULT_HF_MODEL

Signed-off-by: Christian Kadner
---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f5a3c70f7..580ca110d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ env:
   VLLM_PLUGINS: "spyre"
   HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
   DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
-  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
+# DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
 # DEFAULT_HF_MODEL_REV: "2714578f54cfb744ece40df9326ee0b47e879e03"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

From 492a6278bfb2e31ad81d6c61bb5d9aa2e78e1066 Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 13:21:05 -0700
Subject: [PATCH 3/7] formatting and help comments

Signed-off-by: Christian Kadner
---
 .github/workflows/test.yml | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 580ca110d..44c94277e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -51,11 +51,11 @@ jobs:
           markers: "cpu and quantized and multi"
           flags: "--timeout=600 -k 'basic and test_output' --durations=0"
           hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
-          hf_model_rev: "main"
         - name: "embedding"
           markers: "cpu and embedding and not quantized"
           flags: "--timeout=300"
           hf_model: "sentence-transformers/all-roberta-large-v1"
+          hf_model_rev: "main"
         - name: "scoring"
           markers: "cpu and scoring"
           flags: "--timeout=300"
           hf_model: "cross-encoder/stsb-roberta-large"
@@ -157,6 +157,14 @@ jobs:
       - name: "Standardize HF model names for caching"
         id: standardize-names
         run: |
+          # replace '/' characters in HF model names with '--', both for GHA
+          # cache keys and for directory names in the local HF hub cache
+
+          # don't use in-line default values for variable expansion here, so
+          # that a non-default model is never paired with the default model's
+          # revision, as would happen with:
+          #   model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
+          #   revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"
+
           if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
             model="${{ matrix.test_suite.hf_model }}"
             revision="${{ matrix.test_suite.hf_model_rev }}"
@@ -164,17 +172,15 @@
             model="${{ env.DEFAULT_HF_MODEL }}"
             revision="${{ env.DEFAULT_HF_MODEL_REV }}"
           fi
-          # replace '/' with '--'
-          safe_name="${model//\//--}"
-          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
-          echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"
+          safe_name="${model//\//--}"
+          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
+          echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"
 
           if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
             model_2="${{ matrix.test_suite.hf_model_2 }}"
             revision_2="${{ matrix.test_suite.hf_model_2_rev}}"
-            # replace '/' with '--'
             safe_name_2="${model_2//\//--}"
-            echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
+            echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
             echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
           fi
@@ -206,19 +212,19 @@
           if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
-            hf_model="${{ matrix.test_suite.hf_model }}"
-            hf_revision="${{ matrix.test_suite.hf_model_rev }}"
+            model="${{ matrix.test_suite.hf_model }}"
+            revision="${{ matrix.test_suite.hf_model_rev }}"
           else
-            hf_model="${{ env.DEFAULT_HF_MODEL }}"
-            hf_revision="${{ env.DEFAULT_HF_MODEL_REV }}"
+            model="${{ env.DEFAULT_HF_MODEL }}"
+            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
           fi
-          hf_model_2="${{ matrix.test_suite.hf_model_2 }}"
-          hf_revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
+          model_2="${{ matrix.test_suite.hf_model_2 }}"
+          revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
 
-          python3 tools/download_model.py -m "$hf_model" -r "${hf_revision:-main}" &
-
-          if [[ -n "$hf_model_2" ]]; then
-            python3 tools/download_model.py -m "$hf_model_2" -r "${hf_revision_2:-main}" &
+          python3 tools/download_model.py -m "$model" -r "${revision:-main}" &
+
+          if [[ -n "$model_2" ]]; then
+            python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
           fi
 
           wait
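The comment added in PATCH 3 warns against GitHub Actions' in-line default expansion. A small illustration -- hypothetical Python, not part of the series -- of why the fallback must happen for model and revision as a pair:

    # resolve() is a made-up name; it mirrors the workflow's bash if/else.
    def resolve(model, revision, default_model, default_revision):
        # Per-field fallback: an entry that sets hf_model but leaves
        # hf_model_rev empty would inherit DEFAULT_HF_MODEL_REV, a commit
        # hash that belongs to a different repository.
        mixed = (model or default_model, revision or default_revision)
        # Paired fallback, as the workflow does it: either both matrix
        # values or both defaults.
        paired = (model, revision) if model else (default_model,
                                                  default_revision)
        return mixed, paired

With model="JackFram/llama-160m" and revision="", the per-field form pairs llama-160m with the micro-granite revision hash, while the paired form leaves the revision empty so the download script falls back to "main".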
From ca5ba0969ba9081cd79b0de7ddb04a406026961b Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 15:54:52 -0700
Subject: [PATCH 4/7] cache same model revisions as specified in unit tests

Signed-off-by: Christian Kadner
---
 .github/workflows/test.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 154f3056c..0a91cd2ce 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,8 +21,7 @@ env:
   VLLM_PLUGINS: "spyre"
   HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
   DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
-# DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
-# DEFAULT_HF_MODEL_REV: "2714578f54cfb744ece40df9326ee0b47e879e03"
+  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -50,15 +50,17 @@ jobs:
           markers: "cpu and quantized and multi"
           flags: "--timeout=600 -k 'basic and test_output' --durations=0"
           hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
+          hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
         - name: "embedding"
           markers: "cpu and embedding and not quantized"
           flags: "--timeout=300"
           hf_model: "sentence-transformers/all-roberta-large-v1"
-          hf_model_rev: "main"
+          hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
         - name: "scoring"
           markers: "cpu and scoring"
           flags: "--timeout=300"
           hf_model: "cross-encoder/stsb-roberta-large"
+          hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
         - name: "continuous batching"
           markers: "cpu and cb and not quantized"
           flags: "--timeout=300 --durations=0 -s"
@@ -84,7 +85,7 @@
           markers: "compat or (cpu and basic)"
           flags: "--timeout=300"
           hf_model_2: "sentence-transformers/all-roberta-large-v1"
-          hf_model_2_rev: "main"
+          hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
           os: "ubuntu-latest"
           python_version: "3.12"
         # Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
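PATCH 4 pins concrete commit hashes in the matrix, which also rotates every GHA cache key. For reference, a Python rendering of the key construction from the "Standardize HF model names for caching" step (the function name is illustrative; the real logic is the bash above):

    def cache_key(runner_os: str, model: str, revision: str) -> str:
        # '/' cannot appear in cache keys or HF hub directory names, so it
        # is escaped as '--', matching the hub's models--org--name layout.
        safe_name = model.replace("/", "--")
        return f"{runner_os}-hf-model-{safe_name}_{revision}"

    # e.g. the embedding suite on ubuntu-latest (runner.os is "Linux"):
    print(cache_key("Linux", "sentence-transformers/all-roberta-large-v1",
                    "cf74d8acd4f198de950bf004b262e6accfed5d2c"))

Because the revision is part of the key, bumping a pinned hash starts a fresh cache entry, and the stale blob simply expires, as the workflow comment explains.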
From b0aa69cc2a949b712b400030e8126986d3b6307e Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 17:23:02 -0700
Subject: [PATCH 5/7] set model revision in test prompt logprobs

Signed-off-by: Christian Kadner
---
 tests/e2e/test_spyre_prompt_logprobs.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/test_spyre_prompt_logprobs.py b/tests/e2e/test_spyre_prompt_logprobs.py
index 260e1831a..e90e48e2b 100644
--- a/tests/e2e/test_spyre_prompt_logprobs.py
+++ b/tests/e2e/test_spyre_prompt_logprobs.py
@@ -78,7 +78,9 @@ def test_prompt_logprobs_not_supported_with_cb(
     monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
 
     with pytest.raises(ValueError, match="continuous batching"):
-        VllmConfig(model_config=ModelConfig(model=model.name, task="generate"))
+        VllmConfig(model_config=ModelConfig(model=model.name,
+                                            revision=model.revision,
+                                            task="generate"))
 
 
 @pytest.mark.skip
@@ -137,7 +139,8 @@ def _get_hf_prompt_logprobs(model_info: ModelInfo, prompts,
     for each token"""
     tokenizer = AutoTokenizer.from_pretrained(model_info.name,
                                               revision=model_info.revision)
-    model = AutoModelForCausalLM.from_pretrained(model_info.name)
+    model = AutoModelForCausalLM.from_pretrained(model_info.name,
+                                                 revision=model_info.revision)
 
     prompt_logprobs = {}
     for prompt in prompts:

From 7c9d12f2058f30b4396b1da42dc9963a4ad2e974 Mon Sep 17 00:00:00 2001
From: Christian Kadner
Date: Mon, 13 Oct 2025 17:24:03 -0700
Subject: [PATCH 6/7] YAPF!

Signed-off-by: Christian Kadner
---
 tests/e2e/test_spyre_prompt_logprobs.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/test_spyre_prompt_logprobs.py b/tests/e2e/test_spyre_prompt_logprobs.py
index e90e48e2b..22bc1e2bf 100644
--- a/tests/e2e/test_spyre_prompt_logprobs.py
+++ b/tests/e2e/test_spyre_prompt_logprobs.py
@@ -78,9 +78,8 @@ def test_prompt_logprobs_not_supported_with_cb(
     monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
 
     with pytest.raises(ValueError, match="continuous batching"):
-        VllmConfig(model_config=ModelConfig(model=model.name,
-                                            revision=model.revision,
-                                            task="generate"))
+        VllmConfig(model_config=ModelConfig(
+            model=model.name, revision=model.revision, task="generate"))
 
 
 @pytest.mark.skip

From 89c4f46fe2927beeffb65de80c965f8a719331a5 Mon Sep 17 00:00:00 2001
From: Prashant Gupta
Date: Tue, 14 Oct 2025 16:54:26 -0700
Subject: [PATCH 7/7] get_tokenizer with revision for GTI

Signed-off-by: Prashant Gupta
Signed-off-by: Christian Kadner
---
 tests/golden_token_injector.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/golden_token_injector.py b/tests/golden_token_injector.py
index 126246eff..a85b1eef7 100644
--- a/tests/golden_token_injector.py
+++ b/tests/golden_token_injector.py
@@ -49,7 +49,9 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device,
         # for couple requests that does not have too much impact.
         # But since this is used mostly for validation, it would be fine
         # to keep them.
-        self.tokenizer = get_tokenizer(vllm_config.model_config.tokenizer)
+        self.tokenizer = get_tokenizer(
+            vllm_config.model_config.tokenizer,
+            revision=vllm_config.model_config.revision)
 
     def is_argmax_invariant(self) -> bool:
         """Never impacts greedy sampling"""
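Taken together, PATCHes 5-7 thread the pinned revision through every HuggingFace entry point the tests touch. The common shape, sketched with a hypothetical stand-in for the test suite's ModelInfo (only its name and revision fields are evident from these diffs):

    from dataclasses import dataclass

    from transformers import AutoModelForCausalLM, AutoTokenizer

    @dataclass
    class PinnedModel:
        """Illustrative stand-in for the test suite's ModelInfo."""
        name: str
        revision: str

    def load_pinned(info: PinnedModel):
        # Tokenizer and weights resolve against the same commit, so the
        # two artifacts cannot drift apart between CI runs.
        tok = AutoTokenizer.from_pretrained(info.name, revision=info.revision)
        mdl = AutoModelForCausalLM.from_pretrained(info.name,
                                                   revision=info.revision)
        return tok, mdl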