Commit 72a316d

feat(core): Multiple endpoint types on benchmark and endpoint end
Signed-off-by: Tomasz Grzegorzek <[email protected]>
1 parent 1135472 commit 72a316d

File tree

3 files changed, +117 -13 lines changed


packages/nemo-evaluator/src/nemo_evaluator/api/api_dataclasses.py

Lines changed: 5 additions & 3 deletions
@@ -14,7 +14,7 @@
 # limitations under the License.

 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union

 import jinja2
 from pydantic import BaseModel, ConfigDict, Field
@@ -48,7 +48,7 @@ class ApiEndpoint(BaseModel):
     stream: Optional[bool] = Field(
         description="Whether responses should be streamed", default=None
     )
-    type: Optional[EndpointType] = Field(
+    type: Optional[Union[EndpointType, list[EndpointType]]] = Field(
         description="The type of the target", default=None
     )
     url: Optional[str] = Field(description="Url of the model", default=None)
@@ -108,7 +108,9 @@ class EvaluationConfig(BaseModel):
     params: Optional[ConfigParams] = Field(
         description="Parameters to be used for evaluation", default=None
     )
-    supported_endpoint_types: Optional[list[str]] = Field(
+    supported_endpoint_types: Optional[
+        Union[EndpointType, list[Union[EndpointType, list[EndpointType]]]]
+    ] = Field(
         description="Supported endpoint types like chat or completions", default=None
     )
     type: Optional[str] = Field(description="Type of the task", default=None)
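
The widened fields accept either a single EndpointType or a list of them, and supported_endpoint_types additionally accepts nested lists of alternative combinations. A minimal sketch of the new shapes, using only names from this diff (the url and model_id values are illustrative, mirroring the tests further down):

from nemo_evaluator.api.api_dataclasses import (
    ApiEndpoint,
    EndpointType,
    EvaluationConfig,
)

# A model endpoint may now advertise several types at once.
endpoint = ApiEndpoint(
    type=[EndpointType.CHAT, EndpointType.VLM],
    url="localhost",  # illustrative value
    model_id="my_model",  # illustrative value
)

# A benchmark may list alternative requirements; each inner list is one
# combination of endpoint types that the model must provide in full.
config = EvaluationConfig(
    supported_endpoint_types=[
        [EndpointType.COMPLETIONS, EndpointType.VLM],
        [EndpointType.CHAT, EndpointType.VLM],
    ]
)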

packages/nemo-evaluator/src/nemo_evaluator/core/input.py

Lines changed: 27 additions & 9 deletions
@@ -348,20 +348,38 @@ def get_evaluation(


 def check_type_compatibility(evaluation: Evaluation):
-    if (
-        evaluation.config.supported_endpoint_types is not None
-        and evaluation.target.api_endpoint.type
-        not in evaluation.config.supported_endpoint_types
-    ):
+    # Model endpoint types must be checked against benchmark required capabilities.
+    # All benchmark required capabilities must be present in model endpoint types.
+
+    # If the evaluation does not specify particular endpoint types,
+    # we treat it as 'any'.
+
+    # We have to be careful in terms of types. We might run into turning a stringable
+    # dataclass into a set.
+    if evaluation.config.supported_endpoint_types is not None:
+        if not isinstance(evaluation.target.api_endpoint.type, list):
+            evaluation.target.api_endpoint.type = [evaluation.target.api_endpoint.type]
+
+        if not isinstance(evaluation.config.supported_endpoint_types, list):
+            evaluation.config.supported_endpoint_types = [
+                evaluation.config.supported_endpoint_types
+            ]
+        model_types = set(evaluation.target.api_endpoint.type)
+        is_target_compatible = False
+        for benchmark_type_combination in evaluation.config.supported_endpoint_types:
+            if not isinstance(benchmark_type_combination, list):
+                benchmark_type_combination = [benchmark_type_combination]
+
+            if model_types.issuperset(set(benchmark_type_combination)):
+                is_target_compatible = True
+
         if evaluation.target.api_endpoint.type is None:
             raise MisconfigurationError(
                 "target.api_endpoint.type should be defined and match one of the endpoint "
                 f"types supported by the benchmark: '{evaluation.config.supported_endpoint_types}'",
             )
-        if (
-            evaluation.target.api_endpoint.type
-            not in evaluation.config.supported_endpoint_types
-        ):
+
+        if not is_target_compatible:
             raise MisconfigurationError(
                 f"The benchmark '{evaluation.config.type}' does not support the model type '{evaluation.target.api_endpoint.type}'. "
                 f"The benchmark supports '{evaluation.config.supported_endpoint_types}'."

packages/nemo-evaluator/tests/unit_tests/core/test_input.py

Lines changed: 85 additions & 1 deletion
@@ -14,7 +14,17 @@
 # limitations under the License.


-from nemo_evaluator.core.input import merge_dicts
+import pytest
+
+from nemo_evaluator.api.api_dataclasses import (
+    ApiEndpoint,
+    EndpointType,
+    Evaluation,
+    EvaluationConfig,
+    EvaluationTarget,
+)
+from nemo_evaluator.core.input import check_type_compatibility, merge_dicts
+from nemo_evaluator.core.utils import MisconfigurationError


 def test_distinct_keys():
@@ -64,3 +74,77 @@ def test_empty_dicts():
     d2 = {}
     assert merge_dicts(d1, d2) == {"b": 2}
     assert merge_dicts({}, {}) == {}
+
+
+@pytest.mark.parametrize(
+    "model_types,benchmark_types",
+    [
+        (EndpointType.CHAT, EndpointType.CHAT),
+        ([EndpointType.CHAT], [EndpointType.CHAT]),
+        (EndpointType.CHAT, [EndpointType.CHAT]),
+        ([EndpointType.CHAT], EndpointType.CHAT),
+        ("chat", "chat"),
+        ("chat", None),
+        ([EndpointType.CHAT, EndpointType.COMPLETIONS], [EndpointType.CHAT]),
+        ([EndpointType.CHAT, EndpointType.COMPLETIONS], EndpointType.CHAT),
+        (EndpointType.CHAT, [[EndpointType.CHAT], [EndpointType.COMPLETIONS]]),
+        (
+            [EndpointType.CHAT, EndpointType.COMPLETIONS],
+            [EndpointType.CHAT, EndpointType.COMPLETIONS],
+        ),
+        (
+            [EndpointType.CHAT, EndpointType.COMPLETIONS, EndpointType.VLM],
+            [EndpointType.CHAT, EndpointType.COMPLETIONS],
+        ),
+        (
+            [EndpointType.CHAT, EndpointType.VLM],
+            [
+                [EndpointType.COMPLETIONS, EndpointType.VLM],
+                [EndpointType.CHAT, EndpointType.VLM],
+            ],
+        ),
+    ],
+)
+def test_endpoint_type_single_compatible(model_types, benchmark_types):
+    evaluation_config = EvaluationConfig(supported_endpoint_types=benchmark_types)
+    target_config = EvaluationTarget(
+        api_endpoint=ApiEndpoint(type=model_types, url="localhost", model_id="my_model")
+    )
+    evaluation = Evaluation(
+        config=evaluation_config,
+        target=target_config,
+        command="",
+        pkg_name="",
+        framework_name="",
+    )
+    check_type_compatibility(evaluation)
+
+
+@pytest.mark.parametrize(
+    "model_types,benchmark_types",
+    [
+        (EndpointType.CHAT, EndpointType.COMPLETIONS),
+        ("chat", "vlm"),
+        ([EndpointType.CHAT], [[EndpointType.CHAT, EndpointType.VLM]]),
+        (
+            [EndpointType.CHAT, EndpointType.VLM],
+            [[EndpointType.COMPLETIONS, EndpointType.VLM]],
+        ),
+    ],
+)
+def test_endpoint_type_single_incompatible(model_types, benchmark_types):
+    evaluation_config = EvaluationConfig(supported_endpoint_types=benchmark_types)
+    target_config = EvaluationTarget(
+        api_endpoint=ApiEndpoint(type=model_types, url="localhost", model_id="my_model")
+    )
+    evaluation = Evaluation(
+        config=evaluation_config,
+        target=target_config,
+        command="",
+        pkg_name="",
+        framework_name="",
+    )
+    with pytest.raises(
+        MisconfigurationError, match=r".* does not support the model type .*"
+    ):
+        check_type_compatibility(evaluation)
