diff --git a/src/eval/inference/hpo_search_baselines.py b/src/eval/inference/hpo_search_baselines.py
index 8f9bbb7..de90a39 100644
--- a/src/eval/inference/hpo_search_baselines.py
+++ b/src/eval/inference/hpo_search_baselines.py
@@ -359,12 +359,14 @@ def primary_metric(metrics: dict[str, Any], scenario: str) -> tuple[float, str |
 
 
 def gate_passed(metrics: dict[str, Any]) -> bool:
+    if metrics.get("error"):
+        return False
     if metrics.get("score") == 0:
         return False
-    qc = metrics.get("quality_check") or {}
-    if qc.get("pass") is False:
+    qc = metrics.get("quality_check")
+    if not isinstance(qc, dict):
         return False
-    return True
+    return qc.get("pass") is True
 
 
 def _run_text(cmd: list[str], timeout_s: float = 20.0) -> str:
diff --git a/tests/test_hpo_search_baselines.py b/tests/test_hpo_search_baselines.py
index 9cd01e3..c275bd3 100644
--- a/tests/test_hpo_search_baselines.py
+++ b/tests/test_hpo_search_baselines.py
@@ -142,3 +142,15 @@ def test_gate_passed_ignores_legacy_success_gate_field():
 def test_gate_passed_still_fails_quality_and_zero_score():
     assert not hpo.gate_passed({"score": 0, "quality_check": {"pass": True}})
     assert not hpo.gate_passed({"quality_check": {"pass": False}})
+
+
+def test_gate_passed_requires_explicit_successful_quality_check():
+    assert hpo.gate_passed({"quality_check": {"pass": True}})
+    assert not hpo.gate_passed({})
+    assert not hpo.gate_passed({"quality_check": {}})
+    assert not hpo.gate_passed(
+        {
+            "error": "evaluate.py exited with code 1",
+            "quality_check": {"pass": True},
+        }
+    )