Merged

Commits (35)
3691629
Manage supported model configurations
ckadner Sep 5, 2025
1b80333
Reorganize import statements
ckadner Sep 5, 2025
3d8ad48
Lint docs
ckadner Sep 5, 2025
c8ce7da
use 'x86_64' instead of 'amd64'
ckadner Sep 5, 2025
4decb48
typecheck updates
ckadner Sep 5, 2025
09bde9e
more typecheck updates
ckadner Sep 5, 2025
564d9b0
run isort with suggested changes
ckadner Sep 5, 2025
4223ddf
reorganize imports as isort wants them
ckadner Sep 5, 2025
dd6bd49
CI: isort show suggested import changes
ckadner Sep 5, 2025
ca46ba8
update comments in config YAML
ckadner Sep 5, 2025
214a5ce
yapf
ckadner Sep 5, 2025
d67d7b1
run type-check with Python 3.10 by default
ckadner Sep 5, 2025
03c9c76
revert unrelated changes
ckadner Sep 5, 2025
5a67e9a
Merge branch 'main' into model_configs
ckadner Sep 5, 2025
6995cad
Merge branch 'main' into model_configs
ckadner Sep 8, 2025
12bb213
address review comments, add tests
ckadner Sep 22, 2025
654f480
Merge branch 'main' into model_configs
ckadner Sep 24, 2025
566ac37
lint
ckadner Sep 24, 2025
790de2f
remove f-strings from logging statements
ckadner Sep 24, 2025
de4544e
yapf is ruff
ckadner Sep 24, 2025
dca59ba
type-check
ckadner Sep 24, 2025
8a1205b
update supported configs
ckadner Sep 25, 2025
763a112
update supported parameters
ckadner Sep 25, 2025
b2f8649
Merge branch 'main' into model_configs
ckadner Sep 29, 2025
cbb7a1b
assert c.warmup_shapes is None if use_cb
ckadner Sep 30, 2025
cc0a393
update list of supported models
ckadner Sep 30, 2025
3f48a91
requested config `<=` supported config
ckadner Sep 30, 2025
c65aa9e
Validate that prompt + new_tokens <= max_model_len
ckadner Sep 30, 2025
437290a
type-check
ckadner Sep 30, 2025
4f7a804
remove option to error out on unsupported/unknown configuration
ckadner Oct 1, 2025
3970d84
remove configurations that are within the upper bound of another config
ckadner Oct 4, 2025
05405f4
verify config parameters adhere to restrictions
ckadner Oct 4, 2025
82ccf41
Merge branch 'main' into model_configs
ckadner Oct 8, 2025
1caae76
determine model from HF-config (config.json)
ckadner Oct 11, 2025
7f254cf
Merge branch 'main' into model_configs
ckadner Oct 15, 2025
11 changes: 11 additions & 0 deletions docs/user_guide/supported_models.md
@@ -42,3 +42,14 @@ configurations.
[Granite-Embedding-278m (Multilingual)]: https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual
[BAAI/BGE-Reranker (v2-m3)]: https://huggingface.co/BAAI/bge-reranker-v2-m3
[BAAI/BGE-Reranker (Large)]: https://huggingface.co/BAAI/bge-reranker-large

## Runtime Validation

At runtime, the Spyre engine validates the requested model and configuration against the list
of supported models and configurations defined in
<gh-file:vllm_spyre/config/supported_configurations.yaml>. If the requested model or
configuration is not found in that list, a warning is logged.

```yaml
--8<-- "vllm_spyre/config/supported_configurations.yaml:supported-model-runtime-configurations"
```
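
For reference, here is a minimal sketch of invoking the new validator directly, assuming `vllm_spyre` is installed and the backend environment variables are set before the call; the model and values mirror a continuous-batching entry from `supported_configurations.yaml`:

```python
import os

# Validation only runs for the "sendnn" backend; any other backend is
# bypassed with an informational log message.
os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "sendnn"
os.environ["VLLM_SPYRE_USE_CB"] = "1"  # continuous batching mode

from vllm_spyre.config.runtime_config_validator import (
    validate_runtime_configuration)

# Returns True if the requested configuration matches a supported entry,
# False (plus a logged warning) otherwise.
ok = validate_runtime_configuration("ibm-granite/granite-3.3-8b-instruct",
                                    tp_size=4,
                                    max_model_len=8192,
                                    max_num_seqs=4)
print("supported" if ok else "not supported")
```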
131 changes: 131 additions & 0 deletions tests/utils/test_model_config_validator.py
@@ -0,0 +1,131 @@
import logging

import pytest
import yaml
from pytest import LogCaptureFixture

from vllm_spyre.config import runtime_config_validator
from vllm_spyre.config.runtime_config_validator import (
validate_runtime_configuration as validate)


def setup_log_capture(caplog: LogCaptureFixture, level=logging.INFO):
"""
Setup log capture for the test.
"""
caplog.set_level(level)
if caplog.handler not in runtime_config_validator.logger.handlers:
runtime_config_validator.logger.addHandler(caplog.handler)


@pytest.mark.utils
@pytest.mark.cpu
def test_no_eager_validation(monkeypatch, caplog):
"""
Ensure that model runtime config validation is skipped when not running on
Spyre cards.
"""
setup_log_capture(caplog, level=logging.INFO)
with monkeypatch.context() as m:
m.setenv("VLLM_SPYRE_DYNAMO_BACKEND", "eager")
validate("test/model")
assert "validation bypassed" in caplog.text


@pytest.mark.utils
@pytest.mark.cpu
def test_model_not_supported(monkeypatch, caplog):
"""
Ensure we can run model runtime config validation when (pretending to) run
on Spyre cards.
"""
setup_log_capture(caplog, level=logging.INFO)
with monkeypatch.context() as m:
m.setenv("VLLM_SPYRE_DYNAMO_BACKEND", "sendnn")
validate("test/model")
assert "Model 'test/model' is not supported" in caplog.text


@pytest.mark.utils
@pytest.mark.cpu
def test_model_runtime_configurations_file_is_valid(monkeypatch, caplog):
"""
Validate that prompts are multiples of 64
Validate that prompt + new_tokens <= max_model_len
Validate that the batch size is <= a tested upper bound.
"""
setup_log_capture(caplog, level=logging.INFO)
with monkeypatch.context() as m:
m.setenv("VLLM_SPYRE_DYNAMO_BACKEND", "sendnn")
validate("test/model") # ensure configs got loaded
mrcs = runtime_config_validator.model_runtime_configs
assert len(mrcs) > 0
for mrc in mrcs:
for c in mrc.configs:
assert c.tp_size in [1, 2, 4, 8, 16, 32]
if c.cb:
assert c.warmup_shapes is None
assert c.max_model_len % 64 == 0
assert c.max_model_len <= 32 * 1024
assert c.max_num_seqs <= 32
else:
assert c.warmup_shapes is not None
for ws in c.warmup_shapes:
assert ws[0] % 64 == 0
assert ws[0] <= 32 * 1024
assert ws[2] in [1, 2, 4, 8, 16, 32, 64]


@pytest.mark.utils
@pytest.mark.cpu
def test_model_runtime_configurations(monkeypatch, caplog):
"""
Verify that various example model runtime configurations can get validated
against a small list of sample configurations.
"""
test_configs = yaml.safe_load("""
- model: "test/model"
configs: [
{ cb: True, tp_size: 1, max_model_len: 1024, max_num_seqs: 16 },
{ cb: True, tp_size: 4, max_model_len: 2048, max_num_seqs: 8 },
{ cb: True, tp_size: 4, max_model_len: 4096, max_num_seqs: 4 },
{ cb: True, tp_size: 4, max_model_len: 8192, max_num_seqs: 2 },
{ cb: False, tp_size: 1, warmup_shapes: [[64, 20, 4], [128, 20, 2]] },
{ cb: False, tp_size: 1, warmup_shapes: [[256, 20, 1]] },
{ cb: False, tp_size: 2, warmup_shapes: [[64, 20, 4]] },
]
""")
runtime_config_validator.initialize_supported_configurations(test_configs)

setup_log_capture(caplog, level=logging.INFO)

with monkeypatch.context() as m:
m.setenv("VLLM_SPYRE_DYNAMO_BACKEND", "sendnn")
m.setenv("VLLM_SPYRE_USE_CB", "1")
assert validate("test/model", 4, 2048, 8)
assert not validate("model/test", 4, 2048, 8)
# assert that individual values of a requested config can be less than
# the upper bound of a supported config
assert validate("test/model", 4, 1024, 8)
assert validate("test/model", 4, 2048, 4)

with monkeypatch.context() as m:
m.setenv("VLLM_SPYRE_DYNAMO_BACKEND", "sendnn")
m.setenv("VLLM_SPYRE_USE_CB", "0")
assert validate("test/model", 1, warmup_shapes=[[64, 20, 4]])
assert validate("test/model", 1, warmup_shapes=[[128, 20, 2]])
assert validate("test/model",
1,
warmup_shapes=[[64, 20, 4], [128, 20, 2]])
assert validate("test/model",
1,
warmup_shapes=[[128, 20, 2], [64, 20, 4]])
assert validate("test/model", 1, warmup_shapes=[[128, 19, 2]])
assert validate("test/model", 2, warmup_shapes=[[64, 19, 4]])
assert validate("test/model", 2, warmup_shapes=[[64, 19, 2]])
assert not validate(
"test/model", 2, warmup_shapes=[[64, 20, 4], [128, 20, 2]])
assert not validate("test/model",
1,
warmup_shapes=[[64, 20, 4], [128, 20, 2],
[256, 20, 1]])
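
Note that the new tests carry the `utils` and `cpu` pytest markers, so they can be selected and run without Spyre hardware.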
Empty file added vllm_spyre/config/__init__.py
Empty file.
191 changes: 191 additions & 0 deletions vllm_spyre/config/runtime_config_validator.py
@@ -0,0 +1,191 @@
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import yaml
from vllm.logger import init_logger

from vllm_spyre import envs as envs_spyre

_config_file = Path(__file__).parent / "supported_configurations.yaml"

logger = init_logger(__name__)

# warmup_shape = [prompt_length, new_tokens, batch_size]
WarmupShapes = list[tuple[int, int, int]] | list[list[int]]


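# Note: `order=True` generates comparison methods that compare instances as
# tuples of their fields in declaration order (cb, tp_size, max_model_len,
# max_num_seqs); `warmup_shapes` is excluded via `compare=False` and is
# matched separately by is_warmup_shapes_supported().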
@dataclass(order=True)
class RuntimeConfiguration:
cb: bool = False
tp_size: int = 1
max_model_len: int = 0
max_num_seqs: int = 0
warmup_shapes: WarmupShapes | None = field(compare=False, default=None)

def __post_init__(self):
if self.warmup_shapes is not None:
self.warmup_shapes = [(ws[0], ws[1], ws[2])
if isinstance(ws, list) else ws
for ws in self.warmup_shapes] # yapf: disable


@dataclass
class ModelRuntimeConfiguration:
model: str
configs: list[RuntimeConfiguration] | None = None
ignore: bool = False

def __post_init__(self):
self.configs = [
RuntimeConfiguration(**cfg) if isinstance(cfg, dict) else cfg
for cfg in self.configs or []
]


model_runtime_configs: list[ModelRuntimeConfiguration] | None = None
ignored_models: set[str] = set()
runtime_configs_by_model: dict[str, list[RuntimeConfiguration]] = {}


def load_config_yaml() -> list[dict[str, Any]]:
with open(_config_file, encoding="utf-8") as f:
yaml_data = yaml.safe_load(f)
return yaml_data


def initialize_supported_configurations(yaml_data: list[dict[str, Any]]):
global model_runtime_configs, ignored_models, runtime_configs_by_model
model_runtime_configs = [
ModelRuntimeConfiguration(**config_dict) for config_dict in yaml_data
]
ignored_models = {mrc.model for mrc in model_runtime_configs if mrc.ignore}
runtime_configs_by_model = {
mrc.model: mrc.configs or []
for mrc in model_runtime_configs if not mrc.ignore
}


def initialize_supported_configurations_from_file():
yaml_data = load_config_yaml()
initialize_supported_configurations(yaml_data)


def validate_runtime_configuration(
model: str,
tp_size: int = 0,
max_model_len: int = 0,
max_num_seqs: int = 0,
warmup_shapes: WarmupShapes | None = None) -> bool:
"""
Verify if the requested model and configuration is supported by comparing
the requested configuration to all the supported configurations.
"""
# we only validate runtime configurations when running on Spyre cards
if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn":
logger.info(
"Model and runtime configuration validation bypassed for"
" backend '%s'", envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND)
return True

global model_runtime_configs
if model_runtime_configs is None:
initialize_supported_configurations_from_file()

if model in ignored_models:
logger.info("Model '%s' is ignored", model)
return True

if model not in runtime_configs_by_model:
logger.warning("Model '%s' is not supported", model)
return False

use_cb = envs_spyre.VLLM_SPYRE_USE_CB

requested_config = RuntimeConfiguration(
cb=use_cb,
tp_size=tp_size,
max_model_len=max_model_len if use_cb else 0,
max_num_seqs=max_num_seqs if use_cb else 0,
warmup_shapes=warmup_shapes if not use_cb else None)

supported_configs = runtime_configs_by_model.get(model, [])

matching_configs: list[RuntimeConfiguration] = list(
filter(
lambda supported_config: is_requested_config_supported(
requested_config=requested_config,
supported_config=supported_config),
supported_configs,
))

if len(matching_configs) == 0:
logger.warning(
"The requested configuration is not supported for"
" model '%s': %s", model, str(requested_config))
return False
else:
logger.info(
"The requested configuration is supported for"
" model '%s': %s", model, str(requested_config))
return True


def is_requested_config_supported(
requested_config: RuntimeConfiguration,
supported_config: RuntimeConfiguration) -> bool:
"""
Check if the requested configuration is supported by comparing the requested
configuration to all the supported configurations.
"""
# Don't use `if requested_configuration not in supported_configurations:...`
# since warmup shapes don't compare easily (excluded from dataclass __eq__)
# Instead, use filter here and do a set-compare for warmup_shapes separately
return (requested_config.cb == supported_config.cb
and requested_config <= supported_config
and (requested_config.cb or is_warmup_shapes_supported(
requested_config, supported_config)))


def is_warmup_shapes_supported(requested_config: RuntimeConfiguration,
supported_config: RuntimeConfiguration) -> bool:
"""
Check if the requested warmup_shapes are a subset of the supported
warmup_shapes. If a singular warmup_shape is requested, check
if its context length is less than or equal to the context length of a
supported warmup_shapes with the same (or larger) batch size.
"""
requested_shapes = requested_config.warmup_shapes or []
supported_shapes = supported_config.warmup_shapes or []
return (set(requested_shapes).issubset(set(supported_shapes))
or is_context_length_supported(requested_shapes, supported_shapes))


def is_context_length_supported(requested_shapes: WarmupShapes,
supported_shapes: WarmupShapes) -> bool:
"""
If a singular warmup_shape is requested, check if it's context length is
less than or equal to the context length for any of the supported
warmup_shapes with the same batch size (or larger supported batch size).
(context length = prompt_length + new_tokens)
"""
    if len(requested_shapes) != 1:
return False
request_batch_size = requested_shapes[0][2]
supported_shapes_with_matching_batch_size = [(ws[0], ws[1], ws[2])
for ws in supported_shapes
if request_batch_size <= ws[2]
]
return (
len(supported_shapes_with_matching_batch_size) > 0 and
(get_max_model_length(requested_shapes)
<= get_max_model_length(supported_shapes_with_matching_batch_size)))


def get_max_model_length(warmup_shapes: WarmupShapes) -> int:
"""
Return the maximum model length from the given warmup shapes.
"""
# max_model_len = prompt_length + new_tokens
# warmup_shape = [prompt_length, new_tokens, batch_size]
return max([ws[0] + ws[1] for ws in warmup_shapes or []])
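
To make these shape-matching rules concrete, the following small, self-contained sketch exercises both the exact-subset match and the single-shape context-length fallback; the shape values are illustrative only:

```python
from vllm_spyre.config.runtime_config_validator import (
    RuntimeConfiguration, is_warmup_shapes_supported)

supported = RuntimeConfiguration(
    cb=False, tp_size=1, warmup_shapes=[[64, 20, 4], [128, 20, 2]])

# An exact subset of the supported shapes is accepted.
requested = RuntimeConfiguration(
    cb=False, tp_size=1, warmup_shapes=[[64, 20, 4]])
assert is_warmup_shapes_supported(requested, supported)

# A single shape with a smaller total context (128 + 19 = 147 <= 148) at a
# batch size covered by a supported shape is also accepted.
requested = RuntimeConfiguration(
    cb=False, tp_size=1, warmup_shapes=[[128, 19, 2]])
assert is_warmup_shapes_supported(requested, supported)

# A context larger than any supported shape at batch size >= 4 is rejected
# (the only supported shape with batch size >= 4 allows 64 + 20 = 84 tokens).
requested = RuntimeConfiguration(
    cb=False, tp_size=1, warmup_shapes=[[256, 20, 4]])
assert not is_warmup_shapes_supported(requested, supported)
```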
57 changes: 57 additions & 0 deletions vllm_spyre/config/supported_configurations.yaml
@@ -0,0 +1,57 @@
# --8<-- [start:supported-model-runtime-configurations]

# Parameters:
# - cb: True, for continuous batching; False, for static batching mode
# - tp_size: tensor parallel size
# - max_model_len: context length (prompt_length + max_new_tokens)
# - max_num_seqs: number of sequences in a batch (per instance)
# - warmup_shapes: [(fixed_prompt_length, max_new_tokens, batch_size)]

- model: "ibm-granite/granite-3.3-8b-instruct"
configs: [
{ cb: False, tp_size: 1, warmup_shapes: [[2048, 1024, 16]] },
{ cb: False, tp_size: 4, warmup_shapes: [[6144, 2048, 1]] },
{ cb: False, tp_size: 4, warmup_shapes: [[7168, 1024, 1]] },
{ cb: False, tp_size: 4, warmup_shapes: [[7168, 1024, 4]] },
{ cb: True, tp_size: 1, max_model_len: 3072, max_num_seqs: 16 },
{ cb: True, tp_size: 1, max_model_len: 8192, max_num_seqs: 4 },
{ cb: True, tp_size: 2, max_model_len: 8192, max_num_seqs: 4 },
{ cb: True, tp_size: 4, max_model_len: 8192, max_num_seqs: 4 },
{ cb: True, tp_size: 4, max_model_len: 16384, max_num_seqs: 4 },
{ cb: True, tp_size: 4, max_model_len: 32768, max_num_seqs: 32 },
]
- model: "ibm-granite/granite-3.3-8b-instruct-FP8"
configs: [
{ cb: True, tp_size: 1, max_model_len: 3072, max_num_seqs: 16 },
{ cb: True, tp_size: 4, max_model_len: 8192, max_num_seqs: 4 },
{ cb: True, tp_size: 4, max_model_len: 16384, max_num_seqs: 4 },
{ cb: True, tp_size: 4, max_model_len: 32768, max_num_seqs: 32 },
]
- model: "ibm-granite/granite-embedding-125m-english"
configs: [
{ cb: False, tp_size: 1, warmup_shapes: [[512, 0, 1]] },
{ cb: False, tp_size: 1, warmup_shapes: [[512, 0, 64]] },
]
- model: "ibm-granite/granite-embedding-278m-multilingual"
configs: [
{ cb: False, tp_size: 1, warmup_shapes: [[512, 0, 1]] },
{ cb: False, tp_size: 1, warmup_shapes: [[512, 0, 64]] },
]
- model: "BAAI/bge-reranker-v2-m3"
configs: [
{ cb: False, tp_size: 1, warmup_shapes: [[8192, 0, 1]] },
]
- model: "BAAI/bge-reranker-large"
configs: [
{ cb: False, tp_size: 1, warmup_shapes: [[512, 0, 1]] },
{ cb: False, tp_size: 1, warmup_shapes: [[512, 0, 64]] },
]
- model: "sentence-transformers/all-roberta-large-v1"
configs: [
{ cb: False, tp_size: 1, warmup_shapes: [[64, 0, 4], [64, 0, 8], [128, 0, 4], [128, 0, 8]] },
]
# --8<-- [end:supported-model-runtime-configurations]
- model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
ignore: True
- model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
ignore: True
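
For contributors adding support for a new model, a hypothetical entry might look like the following; the model name and all values here are invented for illustration and should reflect configurations actually validated on Spyre hardware:

```yaml
- model: "example-org/example-model"
  configs: [
    # static batching: prompt length 1024 (a multiple of 64),
    # up to 256 new tokens, batch size 8
    { cb: False, tp_size: 1, warmup_shapes: [[1024, 256, 8]] },
    # continuous batching: up to 4096 context tokens, up to 8 sequences
    { cb: True, tp_size: 2, max_model_len: 4096, max_num_seqs: 8 },
  ]
```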