[CI] Enable model revisions in GHA test #523
@@ -21,6 +21,8 @@ env:
   VLLM_PLUGINS: "spyre"
   HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
   DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
+  # DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
+  # DEFAULT_HF_MODEL_REV: "2714578f54cfb744ece40df9326ee0b47e879e03"

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -44,19 +46,20 @@ jobs:
           - name: "static batching"
             markers: "cpu and decoder and not cb and not other_e2e and not quantized"
             flags: "--timeout=300"
-            hf_models: "JackFram/llama-160m"
+            hf_model: "JackFram/llama-160m"
           - name: "fp8"
             markers: "cpu and quantized and multi"
             flags: "--timeout=600 -k 'basic and test_output' --durations=0"
-            hf_models: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
+            hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
           - name: "embedding"
             markers: "cpu and embedding and not quantized"
             flags: "--timeout=300"
-            hf_models: "sentence-transformers/all-roberta-large-v1"
+            hf_model: "sentence-transformers/all-roberta-large-v1"
+            hf_model_rev: "main"
           - name: "scoring"
             markers: "cpu and scoring"
             flags: "--timeout=300"
-            hf_models: "cross-encoder/stsb-roberta-large"
+            hf_model: "cross-encoder/stsb-roberta-large"
           - name: "continuous batching"
             markers: "cpu and cb and not quantized"
             flags: "--timeout=300 --durations=0 -s"
@@ -80,7 +83,8 @@ jobs:
             name: "backward compat"
             markers: "compat or (cpu and basic)"
             flags: "--timeout=300"
-            hf_models: "micro-g3.3_roberta-large"
+            hf_model_2: "sentence-transformers/all-roberta-large-v1"
+            hf_model_2_rev: "main"
           os: "ubuntu-latest"
           python_version: "3.12"
           # Exclude vLLM:main if PR does NOT have "ready" label AND auto-merge is not enabled
@@ -150,19 +154,34 @@ jobs:
           # overwritten.
           uv pip install -v .

       # Standardize model name for cache keys
-      - name: Standardize HF model name
+      - name: "Standardize HF model names for caching"
         id: standardize-names
         run: |
-          model="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"
-          if [[ "$model" == "micro-g3.3_roberta-large" ]]; then
-            echo "model_key=micro-g3.3_roberta-large" >> "$GITHUB_ENV"
-            echo "model_path=${HF_HUB_CACHE}" >> "$GITHUB_ENV"
+          # replace '/' characters in HF_MODEL with '--' for GHA cache keys and
+          # in model file names in local HF hub cache
Review comment on the two added comment lines above: A comment on how this looks would be nice, e.g. what the name was earlier and what it looks like after replacing.
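For illustration (this sketch is not part of the diff; it assumes the default model and the first commented-out revision from the env block above), the substitution and the resulting keys look roughly like this:

    model="ibm-ai-platform/micro-g3.3-8b-instruct-1b"
    revision="6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"
    safe_name="${model//\//--}"      # -> ibm-ai-platform--micro-g3.3-8b-instruct-1b
    model_key="${safe_name}_${revision}"
    # resulting GHA cache key (runner.os is "Linux" on ubuntu-latest):
    #   Linux-hf-model-ibm-ai-platform--micro-g3.3-8b-instruct-1b_6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f
    # resulting local hub cache path:
    #   ${HF_HUB_CACHE}/models--ibm-ai-platform--micro-g3.3-8b-instruct-1b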
+          # don't use in-line default values for variable expansion here to not
+          # use the default model revision with a non-default model like this:
+          # model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
+          # revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"

+          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
+            model="${{ matrix.test_suite.hf_model }}"
+            revision="${{ matrix.test_suite.hf_model_rev }}"
           else
-            # replace / with --
-            safe_name="${model//\//--}"
-            echo "model_key=$safe_name" >> "$GITHUB_ENV"
-            echo "model_path=${HF_HUB_CACHE}/models--$safe_name" >> "$GITHUB_ENV"
+            model="${{ env.DEFAULT_HF_MODEL }}"
+            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
           fi
+          safe_name="${model//\//--}"
+          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
+          echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"

+          if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
+            model_2="${{ matrix.test_suite.hf_model_2 }}"
+            revision_2="${{ matrix.test_suite.hf_model_2_rev}}"
+            safe_name_2="${model_2//\//--}"
+            echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
+            echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
+          fi

       - name: "Restore HF models cache"
@@ -173,54 +192,40 @@ jobs:
           path: ${{ env.model_path }}
           key: ${{ runner.os }}-hf-model-${{ env.model_key }}

+      - name: "Restore HF models cache for additional model"
+        id: cache_restore_2
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
+        uses: actions/cache/restore@v4
+        with:
+          path: ${{ env.model_2_path }}
+          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

       - name: "Download HF models"
-        if: ( steps.changed-src-files.outputs.any_changed == 'true' && steps.cache_restore.outputs.cache-hit != 'true' )
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
         run: |
-          # We are caching HF models (HF_HUB_CACHE) for reliability rather than speed, since HF downloads are flaky for concurrent jobs.
-          # Be careful when adding models to the cache here, as the GHA cache is limited to 10 GB.
-          # If a new model is added here, a new hash key is generated. The previous cache blob can then
-          # be removed by an admin or can be left to expire after 7 days.

-          download_tinygranite() {
-            python -c "from transformers import pipeline, AutoTokenizer; pipeline('text-generation', model='$1'); tokenizer=AutoTokenizer.from_pretrained('$1')"
-          }
-          download_roberta_large() {
-            python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('$1')"
-          }
-          # tinyllama used for static batching tests because static batching is _too slow_
-          download_tinyllama() {
-            python -c "from transformers import pipeline; pipeline('text-generation', model='$1')"
-          }

-          hf_models="${{ matrix.test_suite.hf_models || env.DEFAULT_HF_MODEL }}"

-          if [[ "$hf_models" == "micro-g3.3_roberta-large" ]]; then
-            models=("ibm-ai-platform/micro-g3.3-8b-instruct-1b" "sentence-transformers/all-roberta-large-v1")
+          # We are caching HF models (HF_HUB_CACHE) for reliability rather than
+          # speed, since HF downloads are flaky for concurrent jobs.
+          # Be careful when adding models to the cache here, as the GHA cache is
+          # limited to 10 GB.
+          # If a new model is added here, a new hash key is generated. The
+          # previous cache blob can then be removed by an admin or can be left
+          # to expire after 7 days.

+          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
+            model="${{ matrix.test_suite.hf_model }}"
+            revision="${{ matrix.test_suite.hf_model_rev }}"
           else
-            models=("$hf_models")
+            model="${{ env.DEFAULT_HF_MODEL }}"
+            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
           fi
+          model_2="${{ matrix.test_suite.hf_model_2 }}"
+          revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
-          for model in "${models[@]}"; do
-            echo "Downloading $model ..."
-            case "$model" in
-              "ibm-ai-platform/micro-g3.3-8b-instruct-1b"*)
-                download_tinygranite "$model" &
-                ;;
-              "JackFram/llama-160m")
-                download_tinyllama "$model" &
-                ;;
-              "sentence-transformers/all-roberta-large-v1")
-                download_roberta_large "$model" &
-                ;;
-              "cross-encoder/stsb-roberta-large")
-                download_roberta_large "$model" &
-                ;;
-              *)
-                echo "No download method found for: $model";
-                exit 1
-                ;;
-            esac
-          done
+          python3 tools/download_model.py -m "$model" -r "${revision:-main}" &

+          if [[ -n "$model_2" ]]; then
+            python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
+          fi

           wait
@@ -231,6 +236,13 @@ jobs:
           path: ${{ env.model_path }}
           key: ${{ runner.os }}-hf-model-${{ env.model_key }}

+      - name: "Save HF models cache for additional model"
+        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
+        uses: actions/cache/save@v4
+        with:
+          path: ${{ env.model_2_path }}
+          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}

       - name: "Run tests"
         if: steps.changed-src-files.outputs.any_changed == 'true'
         env:
tools/download_model.py (new file)
@@ -0,0 +1,63 @@
Inline review discussion on this file:

- Why is this tool required if you can use …?
- We are using it for the GitHub Actions workflow to download models with revisions (unless they are already cached in a GHA cache blob). I recall being told, and seeing it in comments, that the HuggingFace CLI is not reliable during GitHub Actions runs, though I have never put that to the test myself.
- Worth a shot? I also seem to remember that the HF CLI was downloading something weird at one point, but I don't see that lately. Maybe something got fixed?
- I tried using the hf download action and it failed in 3 of 10 test jobs: https://github.com/ckadner/vllm-spyre/actions/runs/18510700226/job/52750674898?pr=20
- Oh, [EDIT] …
- Yup. Just reverted. All downloads went through fine: https://github.com/ckadner/vllm-spyre/actions/runs/18510991794/job/52751608834?pr=20
- hmmm
- One more time with …
- We could probably figure out how the space gets used, which files to exclude, and/or how to make more space on the GHA runner. But we already have the download script, and the code in it has been running fine for several months. So I vote for keeping the existing custom download code, albeit in a separate script now.
- Yes, repositories often have extra files such as model weights in other formats. I think the script only downloads the required files, which would explain why it doesn't run out of disk space. I would prefer less code to maintain, but since we have these space restrictions, I guess for now it's better to keep the script.
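For comparison, a rough sketch of the CLI-based alternative discussed above (not what this PR uses; it assumes huggingface_hub's huggingface-cli is installed). By default it snapshots the whole repository, including extra weight formats, which is a plausible cause of the disk-space failures mentioned:

    # hypothetical alternative to tools/download_model.py, not used by this PR:
    # fetch a model at a pinned revision into the shared HF hub cache
    huggingface-cli download ibm-ai-platform/micro-g3.3-8b-instruct-1b \
      --revision 6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f \
      --cache-dir "${HF_HUB_CACHE}"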
#!/usr/bin/env python3
"""Download a model from HuggingFace with revision.
> python3 tools/download_model.py -m <HF-model-id> [-r <git-tag-or-hash>]
"""

import argparse
import logging


def download_granite_or_llama(model: str, revision: str = "main"):
    from transformers import pipeline
    pipeline('text-generation', model=model, revision=revision)


def download_roberta(model: str, revision: str = "main"):
    from sentence_transformers import SentenceTransformer
    SentenceTransformer(model, revision=revision)


download_methods = {
    "ibm-ai-platform/micro-g3.3-8b-instruct-1b": download_granite_or_llama,
    "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8": download_granite_or_llama,
    "JackFram/llama-160m": download_granite_or_llama,
    "cross-encoder/stsb-roberta-large": download_roberta,
    "sentence-transformers/all-roberta-large-v1": download_roberta,
}


def download_model_with_revision(model: str, revision: str = "main"):
    if model in download_methods:
        download_method = download_methods.get(model)
        logging.info("Downloading model '%s' with revision '%s' ...", model,
                     revision)
        download_method(model, revision)
        logging.info("Model '%s' with revision '%s' downloaded.", model,
                     revision)
    else:
        logging.error(
            "No `download_method` found for model '%s'."
            " Supported models: %s", model, str(list(download_methods.keys())))
        exit(1)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", dest="model", help="HuggingFace model ID")
    parser.add_argument("-r",
                        dest="revision",
                        default="main",
                        help="Git hash, tag, or branch (default='main')")
    args, _extra_args = parser.parse_known_args()

    if args.model:
        download_model_with_revision(args.model, args.revision)
    else:
        logging.error("Need to provide a HuggingFace model ID.")
        exit(1)


if __name__ == '__main__':
    main()
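For reference, a usage sketch matching the docstring and the workflow step above (the model IDs and revision are the ones already used elsewhere in this PR; -r falls back to "main" when omitted):

    # download the default test model at a pinned revision into the local HF hub cache
    python3 tools/download_model.py \
      -m ibm-ai-platform/micro-g3.3-8b-instruct-1b \
      -r 6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f

    # revision defaults to "main" when -r is omitted
    python3 tools/download_model.py -m JackFram/llama-160m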