Skip to content

Commit e2db2fb

Browse files
committed
fix(core): raise on more than one compatible endpoint-benchmark type combination
Signed-off-by: Tomasz Grzegorzek <[email protected]>
1 parent 72a316d commit e2db2fb

File tree

2 files changed

+45
-9
lines changed

2 files changed

+45
-9
lines changed

packages/nemo-evaluator/src/nemo_evaluator/core/input.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,12 @@ def check_type_compatibility(evaluation: Evaluation):
371371
benchmark_type_combination = [benchmark_type_combination]
372372

373373
if model_types.issuperset(set(benchmark_type_combination)):
374-
is_target_compatible = True
374+
if is_target_compatible:
375+
raise MisconfigurationError(
376+
f"The benchmark {evaluation.config.type} is compatible with more than one combination of model capabilities {evaluation.target.api_endpoint.type} and needs a specification. Please override model capabilities for this benchmark to match only one combination."
377+
)
378+
else:
379+
is_target_compatible = True
375380

376381
if evaluation.target.api_endpoint.type is None:
377382
raise MisconfigurationError(

packages/nemo-evaluator/tests/unit_tests/core/test_input.py

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,6 @@ def test_empty_dicts():
8888
([EndpointType.CHAT, EndpointType.COMPLETIONS], [EndpointType.CHAT]),
8989
([EndpointType.CHAT, EndpointType.COMPLETIONS], EndpointType.CHAT),
9090
(EndpointType.CHAT, [[EndpointType.CHAT], [EndpointType.COMPLETIONS]]),
91-
(
92-
[EndpointType.CHAT, EndpointType.COMPLETIONS],
93-
[EndpointType.CHAT, EndpointType.COMPLETIONS],
94-
),
95-
(
96-
[EndpointType.CHAT, EndpointType.COMPLETIONS, EndpointType.VLM],
97-
[EndpointType.CHAT, EndpointType.COMPLETIONS],
98-
),
9991
(
10092
[EndpointType.CHAT, EndpointType.VLM],
10193
[
@@ -148,3 +140,42 @@ def test_endpoint_type_single_incompatible(model_types, benchmark_types):
148140
MisconfigurationError, match=r".* does not support the model type .*"
149141
):
150142
check_type_compatibility(evaluation)
143+
144+
145+
@pytest.mark.parametrize(
146+
"model_types,benchmark_types",
147+
[
148+
(
149+
[EndpointType.CHAT, EndpointType.COMPLETIONS],
150+
[EndpointType.CHAT, EndpointType.COMPLETIONS],
151+
),
152+
(
153+
[EndpointType.CHAT, EndpointType.COMPLETIONS, EndpointType.VLM],
154+
[EndpointType.CHAT, EndpointType.COMPLETIONS],
155+
),
156+
(
157+
[EndpointType.CHAT, EndpointType.COMPLETIONS, EndpointType.VLM],
158+
[
159+
[EndpointType.COMPLETIONS, EndpointType.VLM],
160+
[EndpointType.CHAT, EndpointType.VLM],
161+
],
162+
),
163+
],
164+
)
165+
def test_endpoint_type_raise_on_more_than_one(model_types, benchmark_types):
166+
evaluation_config = EvaluationConfig(supported_endpoint_types=benchmark_types)
167+
target_config = EvaluationTarget(
168+
api_endpoint=ApiEndpoint(type=model_types, url="localhost", model_id="my_model")
169+
)
170+
evaluation = Evaluation(
171+
config=evaluation_config,
172+
target=target_config,
173+
command="",
174+
pkg_name="",
175+
framework_name="",
176+
)
177+
with pytest.raises(
178+
MisconfigurationError,
179+
match=r".* is compatible with more than one combination of model capabilities .*",
180+
):
181+
check_type_compatibility(evaluation)

0 commit comments

Comments (0)