Commit b471371

clean up imports, type hints and docs
1 parent 873f552 commit b471371

File tree: 3 files changed, +38 -81 lines changed

src/cleanlab_codex/internal/validator.py

Lines changed: 18 additions & 30 deletions
@@ -1,19 +1,12 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional, Sequence, cast
 
-from cleanlab_codex.utils.errors import MissingDependencyError
+from cleanlab_tlm.utils.rag import Eval, TrustworthyRAGScore, get_default_evals
 
-try:
-    from cleanlab_tlm.utils.rag import Eval, TrustworthyRAGScore, get_default_evals
-except ImportError as e:
-    raise MissingDependencyError(
-        import_name=e.name or "cleanlab-tlm",
-        package_url="https://github.com/cleanlab/cleanlab-tlm",
-    ) from e
+from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore
 
 if TYPE_CHECKING:
-    from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore
     from cleanlab_codex.validator import BadResponseThresholds
 
 
@@ -40,26 +33,21 @@ def get_default_trustworthyrag_config() -> dict[str, Any]:
 
 
 def update_scores_based_on_thresholds(
-    scores: ThresholdedTrustworthyRAGScore, thresholds: BadResponseThresholds
-) -> None:
+    scores: TrustworthyRAGScore | Sequence[TrustworthyRAGScore], thresholds: BadResponseThresholds
+) -> ThresholdedTrustworthyRAGScore:
     """Adds a `is_bad` flag to the scores dictionaries based on the thresholds."""
-    for eval_name, score_dict in scores.items():
-        score_dict.setdefault("is_bad", False)
-        if (score := score_dict["score"]) is not None:
-            score_dict["is_bad"] = score < thresholds.get_threshold(eval_name)
 
+    # Helper function to check if a score is bad
+    def is_bad(score: Optional[float], threshold: float) -> bool:
+        return score is not None and score < threshold
 
-def is_bad_response(
-    scores: TrustworthyRAGScore | ThresholdedTrustworthyRAGScore,
-    thresholds: BadResponseThresholds,
-) -> bool:
-    """
-    Check if the response is bad based on the scores computed by TrustworthyRAG and the config containing thresholds.
-    """
-    for eval_metric, score_dict in scores.items():
-        score = score_dict["score"]
-        if score is None:
-            continue
-        if score < thresholds.get_threshold(eval_metric):
-            return True
-    return False
+    if isinstance(scores, Sequence):
+        raise NotImplementedError("Batching is not supported yet.")
+
+    thresholded_scores = {}
+    for eval_name, score_dict in scores.items():
+        thresholded_scores[eval_name] = {
+            **score_dict,
+            "is_bad": is_bad(score_dict["score"], thresholds.get_threshold(eval_name)),
+        }
+    return cast(ThresholdedTrustworthyRAGScore, thresholded_scores)
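
Not part of the diff, but as orientation for reviewers: a minimal sketch of how the refactored helper is now meant to be called. The plain-dict score shape and the BadResponseThresholds field names (trustworthiness, response_helpfulness) are assumptions inferred from the tests further below, not taken from this commit.

    from cleanlab_codex.internal.validator import update_scores_based_on_thresholds
    from cleanlab_codex.validator import BadResponseThresholds

    # Illustrative scores standing in for a TrustworthyRAG result (assumed shape).
    scores = {
        "trustworthiness": {"score": 0.65},
        "response_helpfulness": {"score": 0.90},
    }
    # Assumed field names on the pydantic model; see BadResponseThresholds below.
    thresholds = BadResponseThresholds(trustworthiness=0.7, response_helpfulness=0.23)

    thresholded = update_scores_based_on_thresholds(scores=scores, thresholds=thresholds)
    # Each entry keeps its original keys and gains an "is_bad" flag, e.g.:
    # {"trustworthiness": {"score": 0.65, "is_bad": True},
    #  "response_helpfulness": {"score": 0.90, "is_bad": False}}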

src/cleanlab_codex/validator.py

Lines changed: 19 additions & 26 deletions
@@ -4,27 +4,20 @@
 
 from __future__ import annotations
 
-from typing import Any, Callable, Optional, cast
+from typing import TYPE_CHECKING, Any, Callable, Optional, cast
 
+from cleanlab_tlm import TrustworthyRAG
 from pydantic import BaseModel, Field, field_validator
 
 from cleanlab_codex.internal.validator import (
     get_default_evaluations,
     get_default_trustworthyrag_config,
 )
-from cleanlab_codex.internal.validator import is_bad_response as _is_bad_response
 from cleanlab_codex.internal.validator import update_scores_based_on_thresholds as _update_scores_based_on_thresholds
 from cleanlab_codex.project import Project
-from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore
-from cleanlab_codex.utils.errors import MissingDependencyError
 
-try:
-    from cleanlab_tlm import TrustworthyRAG
-except ImportError as e:
-    raise MissingDependencyError(
-        import_name=e.name or "cleanlab-tlm",
-        package_url="https://github.com/cleanlab/cleanlab-tlm",
-    ) from e
+if TYPE_CHECKING:
+    from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore
 
 
 class BadResponseThresholds(BaseModel):
@@ -141,8 +134,8 @@ def validate(
 
         Returns:
             dict[str, Any]: A dictionary containing:
-                - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise.
-                - 'expert_answer': Alternate SME-provided answer from Codex, or None if no answer could be found in the Codex Project.
+                - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found, or None otherwise.
+                - 'is_bad_response': True if the response is flagged as potentially bad (when True, a lookup in Codex is performed), False otherwise.
                 - Additional keys: Various keys from a [`ThresholdedTrustworthyRAGScore`](/cleanlab_codex/types/validator/#class-thresholdedtrustworthyragscore) dictionary, with raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. `is_bad` indicating whether the score is below the threshold.
         """
         scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt)
@@ -164,7 +157,7 @@ def detect(
         prompt: Optional[str] = None,
         form_prompt: Optional[Callable[[str, str], str]] = None,
     ) -> tuple[ThresholdedTrustworthyRAGScore, bool]:
-        """Evaluate the response quality using TrustworthyRAG and determine if it is a bad response.
+        """Evaluate the response quality using TrustworthyRAG and determine if it is a bad response via thresholding.
 
         Args:
             query (str): The user query that was used to generate the response.
178171
- bool: True if the response is determined to be bad based on the evaluation scores
179172
and configured thresholds, False otherwise.
180173
"""
181-
scores = cast(
182-
ThresholdedTrustworthyRAGScore,
183-
self._tlm_rag.score(
184-
response=response,
185-
query=query,
186-
context=context,
187-
prompt=prompt,
188-
form_prompt=form_prompt,
189-
),
174+
scores = self._tlm_rag.score(
175+
response=response,
176+
query=query,
177+
context=context,
178+
prompt=prompt,
179+
form_prompt=form_prompt,
190180
)
191181

192-
_update_scores_based_on_thresholds(scores, thresholds=self._bad_response_thresholds)
182+
thresholded_scores = _update_scores_based_on_thresholds(
183+
scores=scores,
184+
thresholds=self._bad_response_thresholds,
185+
)
193186

194-
is_bad_response = _is_bad_response(scores, self._bad_response_thresholds)
195-
return scores, is_bad_response
187+
is_bad_response = any(score_dict["is_bad"] for score_dict in thresholded_scores.values())
188+
return thresholded_scores, is_bad_response
196189

197190
def remediate(self, query: str) -> str | None:
198191
"""Request a SME-provided answer for this query, if one is available in Codex.

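For context (not part of this commit): a minimal sketch of the public call path these changes sit behind. The constructor argument name codex_access_key is an assumption, and the query/context/response strings are made up; only the validate()/detect() signatures come from this file.

    from cleanlab_codex.validator import Validator

    validator = Validator(codex_access_key="...")  # assumed constructor argument

    result = validator.validate(
        query="What is the annual fee?",
        context="The Basic card has a $50 annual fee, waived in the first year.",
        response="The Basic card has no annual fee.",
    )

    # Per the updated docstring: 'expert_answer' (if the response was flagged and
    # Codex had an answer), 'is_bad_response', and one thresholded entry per
    # evaluation metric, each carrying its 'is_bad' flag.
    if result["is_bad_response"]:
        print(result["expert_answer"])
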
tests/internal/test_validator.py

Lines changed: 1 addition & 25 deletions
@@ -1,9 +1,8 @@
 from typing import cast
 
-import pytest
 from cleanlab_tlm.utils.rag import TrustworthyRAGScore
 
-from cleanlab_codex.internal.validator import get_default_evaluations, is_bad_response
+from cleanlab_codex.internal.validator import get_default_evaluations
 from cleanlab_codex.validator import BadResponseThresholds
 
 
@@ -28,26 +27,3 @@ def make_is_bad_response_config(trustworthiness: float, response_helpfulness: fl
 
 def test_get_default_evaluations() -> None:
     assert {evaluation.name for evaluation in get_default_evaluations()} == {"response_helpfulness"}
-
-
-class TestIsBadResponse:
-    @pytest.fixture
-    def scores(self) -> TrustworthyRAGScore:
-        return make_scores(0.92, 0.75)
-
-    @pytest.fixture
-    def custom_is_bad_response_config(self) -> BadResponseThresholds:
-        return make_is_bad_response_config(0.6, 0.7)
-
-    def test_thresholds(self, scores: TrustworthyRAGScore) -> None:
-        # High trustworthiness_threshold
-        is_bad_response_config = make_is_bad_response_config(0.921, 0.5)
-        assert is_bad_response(scores, is_bad_response_config)
-
-        # High response_helpfulness_threshold
-        is_bad_response_config = make_is_bad_response_config(0.5, 0.751)
-        assert is_bad_response(scores, is_bad_response_config)
-
-    def test_scores(self, custom_is_bad_response_config: BadResponseThresholds) -> None:
-        scores = make_scores(0.59, 0.7)
-        assert is_bad_response(scores, custom_is_bad_response_config)
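
The removed TestIsBadResponse cases have no replacement in this commit. A rough sketch of equivalent coverage against the refactored helper, assuming the module's existing make_scores and make_is_bad_response_config helpers keep their current signatures and that make_scores keys its result by "trustworthiness" and "response_helpfulness" (both assumptions, outside this hunk):

    from cleanlab_codex.internal.validator import update_scores_based_on_thresholds


    def test_update_scores_based_on_thresholds() -> None:
        # trustworthiness 0.92 < 0.921 -> flagged; response_helpfulness 0.75 >= 0.5 -> not flagged
        scores = make_scores(0.92, 0.75)
        thresholds = make_is_bad_response_config(0.921, 0.5)

        thresholded = update_scores_based_on_thresholds(scores=scores, thresholds=thresholds)

        assert thresholded["trustworthiness"]["is_bad"]
        assert not thresholded["response_helpfulness"]["is_bad"]
        assert any(score_dict["is_bad"] for score_dict in thresholded.values())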
