From 1bc7370a7589755c556fdecad910d3bc7d600f85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Thu, 20 Mar 2025 21:56:27 +0000 Subject: [PATCH 01/35] add cleanlab-tlm as a dependency in pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 8fc930e..6b36862 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ + "cleanlab-tlm>=1.0.12", "codex-sdk==0.1.0a12", "pydantic>=2.0.0, <3", ] From 2529ae669d5d574565ae1549135a7822afae52b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Thu, 20 Mar 2025 23:06:49 +0000 Subject: [PATCH 02/35] Add response validation functionality using TrustworthyRAG --- src/cleanlab_codex/__init__.py | 3 +- src/cleanlab_codex/internal/validator.py | 86 +++++++++++++++++ src/cleanlab_codex/validator.py | 113 +++++++++++++++++++++++ tests/internal/test_validator.py | 55 +++++++++++ 4 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 src/cleanlab_codex/internal/validator.py create mode 100644 src/cleanlab_codex/validator.py create mode 100644 tests/internal/test_validator.py diff --git a/src/cleanlab_codex/__init__.py b/src/cleanlab_codex/__init__.py index d1b8ef6..572a626 100644 --- a/src/cleanlab_codex/__init__.py +++ b/src/cleanlab_codex/__init__.py @@ -2,5 +2,6 @@ from cleanlab_codex.client import Client from cleanlab_codex.codex_tool import CodexTool from cleanlab_codex.project import Project +from cleanlab_codex.validator import Validator -__all__ = ["Client", "CodexTool", "Project"] +__all__ = ["Client", "CodexTool", "Project", "Validator"] diff --git a/src/cleanlab_codex/internal/validator.py b/src/cleanlab_codex/internal/validator.py new file mode 100644 index 0000000..ff2fd7f --- /dev/null +++ b/src/cleanlab_codex/internal/validator.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + +from cleanlab_codex.utils.errors import MissingDependencyError + +try: + from cleanlab_tlm.utils.rag import Eval, TrustworthyRAGScore, get_default_evals +except ImportError as e: + raise MissingDependencyError( + import_name=e.name or "cleanlab-tlm", + package_url="https://github.com/cleanlab/cleanlab-tlm", + ) from e + + +"""Evaluation metrics (excluding trustworthiness) that are used to determine if a response is bad.""" +EVAL_METRICS = ["response_helpfulness"] + +"""Evaluation metrics that are used to determine if a response is bad.""" +BAD_RESPONSE_EVAL_METRICS = ["trustworthiness", *EVAL_METRICS] + + +class IsBadResponseConfig(BaseModel): + """Config for determining if a response is bad. + Each key is an evaluation metric and the value is a threshold such that if the score is below the threshold, the response is bad. + """ + + trustworthiness: float = Field( + description="Threshold for trustworthiness. If the score is below this threshold, the response is bad.", + default=0.5, + ge=0, + le=1, + ) + response_helpfulness: float = Field( + description="Threshold for response helpfulness. If the score is below this threshold, the response is bad.", + default=0.5, + ge=0, + le=1, + ) + + +def get_default_evaluations() -> list[Eval]: + """Get the default evaluations for the TrustworthyRAG. + + Note: + This excludes trustworthiness, which is automatically computed by TrustworthyRAG. 
+ """ + return [evaluation for evaluation in get_default_evals() if evaluation.name in EVAL_METRICS] + + +DEFAULT_IS_BAD_RESPONSE_CONFIG: IsBadResponseConfig = IsBadResponseConfig( + trustworthiness=0.5, + response_helpfulness=0.5, +) + + +DEFAULT_TRUSTWORTHYRAG_CONFIG = { + "options": { + "log": ["explanation"], + }, +} + + +def get_default_trustworthyrag_config() -> dict[str, Any]: + """Get the default configuration for the TrustworthyRAG.""" + return DEFAULT_TRUSTWORTHYRAG_CONFIG + + +def is_bad_response(scores: TrustworthyRAGScore, is_bad_response_config: IsBadResponseConfig | None = None) -> bool: + """ + Check if the response is bad based on the scores computed by TrustworthyRAG and the config containing thresholds. + """ + is_bad_response_config_dict: dict[str, float] = IsBadResponseConfig.model_validate( + is_bad_response_config or DEFAULT_IS_BAD_RESPONSE_CONFIG + ).model_dump() + for eval_metric in BAD_RESPONSE_EVAL_METRICS: + score = scores[eval_metric]["score"] + if score is None: + error_msg = f"Score for {eval_metric} is None" + raise ValueError(error_msg) + threshold = is_bad_response_config_dict[eval_metric] + if score < threshold: + return True + return False diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py new file mode 100644 index 0000000..7949c16 --- /dev/null +++ b/src/cleanlab_codex/validator.py @@ -0,0 +1,113 @@ +""" +Leverage Cleanlab's Evals and Codex to detect and remediate bad responses in RAG applications. +""" + +from __future__ import annotations + +from typing import Any, Optional, cast + +from cleanlab_codex.internal.validator import ( + IsBadResponseConfig, + get_default_evaluations, + get_default_trustworthyrag_config, +) +from cleanlab_codex.internal.validator import is_bad_response as _is_bad_response +from cleanlab_codex.project import Project +from cleanlab_codex.utils.errors import MissingDependencyError + +try: + from cleanlab_tlm import TrustworthyRAG + from cleanlab_tlm.utils.rag import TrustworthyRAGScore +except ImportError as e: + raise MissingDependencyError( + import_name=e.name or "cleanlab-tlm", + package_url="https://github.com/cleanlab/cleanlab-tlm", + ) from e + + +class Validator: + def __init__( + self, + codex_access_key: str, + tlm_api_key: Optional[str] = None, + trustworthy_rag_config: Optional[dict[str, Any]] = None, + is_bad_response_config: Optional[dict[str, float]] = None, + ): + """Evaluates the quality of responses generated in RAG applications and remediates them if needed. + + This object combines Cleanlab's various Evals with thresholding to detect bad responses and remediates them with Codex. + + Args: + codex_access_key (str): The [access key](/codex/web_tutorials/create_project/#access-keys) for a Codex project. + tlm_api_key (Optional[str]): The API key for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag). + trustworthy_rag_config (Optional[dict[str, Any]]): The constructor arguments for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag). + is_bad_response_config (Optional[dict[str, float]]): The configuration for determining if a response is bad. 
+ """ + trustworthy_rag_config = trustworthy_rag_config or get_default_trustworthyrag_config() + if tlm_api_key is not None: + trustworthy_rag_config["api_key"] = tlm_api_key + self._is_bad_response_config = IsBadResponseConfig.model_validate(is_bad_response_config or {}) + + self._project: Project = Project.from_access_key(access_key=codex_access_key) + + trustworthy_rag_config.setdefault("evals", get_default_evaluations()) + self._tlm_rag = TrustworthyRAG(**trustworthy_rag_config) + + def validate(self, query: str, context: str, response: str) -> dict[str, Any]: + """Validate the response quality and generate an alternative response if needed. + + Args: + query (str): The user's original query. + context (str): The context provided to generate the response. + response (str): The response to evaluate. + + Returns: + dict[str, Any]: A dictionary containing: + - 'is_bad_response': True if the response is determined to be bad, False otherwise. + - 'alt_answer': The alternative response from Codex, or None if no response could be fetched from Codex. + - Other evaluation metrics from TrustworthyRAG. + """ + scores, is_bad_response = self.detect(query, context, response) + alt_answer = None + if is_bad_response: + alt_answer = self.remediate(query) + + return { + "is_bad_response": is_bad_response, + "alt_answer": alt_answer, + **scores, + } + + def detect(self, query: str, context: str, response: str) -> tuple[TrustworthyRAGScore, bool]: + """Evaluate the response quality using TrustworthyRAG and determine if it is a bad response. + + Args: + query (str): The user's original query. + context (str): The context provided to generate the response. + response (str): The response to evaluate. + + Returns: + tuple[TrustworthyRAGScore, bool]: A tuple containing: + - TrustworthyRAGScore: Quality scores for different evaluation metrics like trustworthiness + and response helpfulness. Each metric has a score between 0-1. + - bool: True if the response is determined to be bad based on the evaluation scores + and configured thresholds, False otherwise. + """ + scores = cast(TrustworthyRAGScore, self._tlm_rag.score(response=response, query=query, context=context)) + _config = ( + IsBadResponseConfig.model_validate(self._is_bad_response_config) if self._is_bad_response_config else None + ) + is_bad_response = _is_bad_response(scores, _config) + return scores, is_bad_response + + def remediate(self, query: str) -> str | None: + """Queries Codex to get an alternative response when the original response is determined to be bad. + + Args: + query (str): The user's original query to get an alternative response for. + + Returns: + str | None: The alternative response from Codex, or None if no response could be fetched from Codex. 
+ """ + codex_answer, _ = self._project.query(question=query) + return codex_answer diff --git a/tests/internal/test_validator.py b/tests/internal/test_validator.py new file mode 100644 index 0000000..d3ce3ab --- /dev/null +++ b/tests/internal/test_validator.py @@ -0,0 +1,55 @@ +from typing import cast + +import pytest +from cleanlab_tlm.utils.rag import TrustworthyRAGScore + +from cleanlab_codex.internal.validator import IsBadResponseConfig, get_default_evaluations, is_bad_response + + +def make_scores(trustworthiness: float, response_helpfulness: float) -> TrustworthyRAGScore: + scores = { + "trustworthiness": { + "score": trustworthiness, + }, + "response_helpfulness": { + "score": response_helpfulness, + }, + } + return cast(TrustworthyRAGScore, scores) + + +def make_is_bad_response_config(trustworthiness: float, response_helpfulness: float) -> IsBadResponseConfig: + return IsBadResponseConfig( + trustworthiness=trustworthiness, + response_helpfulness=response_helpfulness, + ) + + +def test_get_default_evaluations() -> None: + assert {evaluation.name for evaluation in get_default_evaluations()} == {"response_helpfulness"} + + +class TestIsBadResponse: + @pytest.fixture + def scores(self) -> TrustworthyRAGScore: + return make_scores(0.92, 0.75) + + @pytest.fixture + def custom_is_bad_response_config(self) -> IsBadResponseConfig: + return make_is_bad_response_config(0.6, 0.7) + + def test_thresholds(self, scores: TrustworthyRAGScore) -> None: + default_is_bad_response = is_bad_response(scores) + assert not default_is_bad_response + + # High trustworthiness_threshold + is_bad_response_config = make_is_bad_response_config(0.921, 0.5) + assert is_bad_response(scores, is_bad_response_config) + + # High response_helpfulness_threshold + is_bad_response_config = make_is_bad_response_config(0.5, 0.751) + assert is_bad_response(scores, is_bad_response_config) + + def test_scores(self, custom_is_bad_response_config: IsBadResponseConfig) -> None: + scores = make_scores(0.59, 0.7) + assert is_bad_response(scores, custom_is_bad_response_config) From 722d287cae778a2a350e651b9fc93cd3f6062706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Fri, 21 Mar 2025 00:57:10 +0000 Subject: [PATCH 03/35] alt_answer -> expert_answer --- src/cleanlab_codex/validator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 7949c16..5074341 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -64,17 +64,17 @@ def validate(self, query: str, context: str, response: str) -> dict[str, Any]: Returns: dict[str, Any]: A dictionary containing: - 'is_bad_response': True if the response is determined to be bad, False otherwise. - - 'alt_answer': The alternative response from Codex, or None if no response could be fetched from Codex. + - 'expert_answer': The alternative response from Codex, or None if no response could be fetched from Codex. - Other evaluation metrics from TrustworthyRAG. 
""" scores, is_bad_response = self.detect(query, context, response) - alt_answer = None + expert_answer = None if is_bad_response: - alt_answer = self.remediate(query) + expert_answer = self.remediate(query) return { "is_bad_response": is_bad_response, - "alt_answer": alt_answer, + "expert_answer": expert_answer, **scores, } From 6f64a127b5e1b87834ceb3b7eda59d357bb9d800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Fri, 21 Mar 2025 14:10:37 +0000 Subject: [PATCH 04/35] address comments --- src/cleanlab_codex/internal/validator.py | 23 +++++--------------- src/cleanlab_codex/validator.py | 27 ++++++++++++------------ tests/internal/test_validator.py | 10 ++++----- 3 files changed, 23 insertions(+), 37 deletions(-) diff --git a/src/cleanlab_codex/internal/validator.py b/src/cleanlab_codex/internal/validator.py index ff2fd7f..09b91b9 100644 --- a/src/cleanlab_codex/internal/validator.py +++ b/src/cleanlab_codex/internal/validator.py @@ -22,7 +22,8 @@ BAD_RESPONSE_EVAL_METRICS = ["trustworthiness", *EVAL_METRICS] -class IsBadResponseConfig(BaseModel): + +class BadResponseThresholds(BaseModel): """Config for determining if a response is bad. Each key is an evaluation metric and the value is a threshold such that if the score is below the threshold, the response is bad. """ @@ -50,12 +51,6 @@ def get_default_evaluations() -> list[Eval]: return [evaluation for evaluation in get_default_evals() if evaluation.name in EVAL_METRICS] -DEFAULT_IS_BAD_RESPONSE_CONFIG: IsBadResponseConfig = IsBadResponseConfig( - trustworthiness=0.5, - response_helpfulness=0.5, -) - - DEFAULT_TRUSTWORTHYRAG_CONFIG = { "options": { "log": ["explanation"], @@ -68,19 +63,11 @@ def get_default_trustworthyrag_config() -> dict[str, Any]: return DEFAULT_TRUSTWORTHYRAG_CONFIG -def is_bad_response(scores: TrustworthyRAGScore, is_bad_response_config: IsBadResponseConfig | None = None) -> bool: +def is_bad_response(scores: TrustworthyRAGScore, thresholds: dict[str, float]) -> bool: """ Check if the response is bad based on the scores computed by TrustworthyRAG and the config containing thresholds. """ - is_bad_response_config_dict: dict[str, float] = IsBadResponseConfig.model_validate( - is_bad_response_config or DEFAULT_IS_BAD_RESPONSE_CONFIG - ).model_dump() - for eval_metric in BAD_RESPONSE_EVAL_METRICS: - score = scores[eval_metric]["score"] - if score is None: - error_msg = f"Score for {eval_metric} is None" - raise ValueError(error_msg) - threshold = is_bad_response_config_dict[eval_metric] - if score < threshold: + for eval_metric, threshold in thresholds.items(): + if scores[eval_metric]["score"] < threshold: return True return False diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 5074341..08b8739 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -7,7 +7,7 @@ from typing import Any, Optional, cast from cleanlab_codex.internal.validator import ( - IsBadResponseConfig, + BadResponseThresholds, get_default_evaluations, get_default_trustworthyrag_config, ) @@ -31,7 +31,7 @@ def __init__( codex_access_key: str, tlm_api_key: Optional[str] = None, trustworthy_rag_config: Optional[dict[str, Any]] = None, - is_bad_response_config: Optional[dict[str, float]] = None, + bad_response_thresholds: Optional[dict[str, float]] = None, ): """Evaluates the quality of responses generated in RAG applications and remediates them if needed. 
@@ -41,12 +41,12 @@ def __init__( codex_access_key (str): The [access key](/codex/web_tutorials/create_project/#access-keys) for a Codex project. tlm_api_key (Optional[str]): The API key for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag). trustworthy_rag_config (Optional[dict[str, Any]]): The constructor arguments for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag). - is_bad_response_config (Optional[dict[str, float]]): The configuration for determining if a response is bad. + bad_response_thresholds (Optional[dict[str, float]]): The thresholds for determining if a response is bad. """ trustworthy_rag_config = trustworthy_rag_config or get_default_trustworthyrag_config() if tlm_api_key is not None: trustworthy_rag_config["api_key"] = tlm_api_key - self._is_bad_response_config = IsBadResponseConfig.model_validate(is_bad_response_config or {}) + self._bad_response_thresholds = BadResponseThresholds.model_validate(bad_response_thresholds or {}) self._project: Project = Project.from_access_key(access_key=codex_access_key) @@ -63,9 +63,9 @@ def validate(self, query: str, context: str, response: str) -> dict[str, Any]: Returns: dict[str, Any]: A dictionary containing: - - 'is_bad_response': True if the response is determined to be bad, False otherwise. - - 'expert_answer': The alternative response from Codex, or None if no response could be fetched from Codex. - - Other evaluation metrics from TrustworthyRAG. + - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. + - 'expert_answer': Alternate SME-provided answer from Codex, or None if no answer could be found in the Codex Project. + - Raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. """ scores, is_bad_response = self.detect(query, context, response) expert_answer = None @@ -94,20 +94,19 @@ def detect(self, query: str, context: str, response: str) -> tuple[TrustworthyRA and configured thresholds, False otherwise. """ scores = cast(TrustworthyRAGScore, self._tlm_rag.score(response=response, query=query, context=context)) - _config = ( - IsBadResponseConfig.model_validate(self._is_bad_response_config) if self._is_bad_response_config else None - ) - is_bad_response = _is_bad_response(scores, _config) + + thresholds_dict = self._bad_response_thresholds.model_dump() + is_bad_response = _is_bad_response(scores, thresholds_dict) return scores, is_bad_response def remediate(self, query: str) -> str | None: - """Queries Codex to get an alternative response when the original response is determined to be bad. + """Request a SME-provided answer for this query, if one is available in Codex. Args: - query (str): The user's original query to get an alternative response for. + query (str): The user's original query to get SME-provided answer for. Returns: - str | None: The alternative response from Codex, or None if no response could be fetched from Codex. + str | None: The SME-provided answer from Codex, or None if no answer could be found in the Codex Project. 
""" codex_answer, _ = self._project.query(question=query) return codex_answer diff --git a/tests/internal/test_validator.py b/tests/internal/test_validator.py index d3ce3ab..847c31d 100644 --- a/tests/internal/test_validator.py +++ b/tests/internal/test_validator.py @@ -3,7 +3,7 @@ import pytest from cleanlab_tlm.utils.rag import TrustworthyRAGScore -from cleanlab_codex.internal.validator import IsBadResponseConfig, get_default_evaluations, is_bad_response +from cleanlab_codex.internal.validator import BadResponseThresholds, get_default_evaluations, is_bad_response def make_scores(trustworthiness: float, response_helpfulness: float) -> TrustworthyRAGScore: @@ -18,8 +18,8 @@ def make_scores(trustworthiness: float, response_helpfulness: float) -> Trustwor return cast(TrustworthyRAGScore, scores) -def make_is_bad_response_config(trustworthiness: float, response_helpfulness: float) -> IsBadResponseConfig: - return IsBadResponseConfig( +def make_is_bad_response_config(trustworthiness: float, response_helpfulness: float) -> BadResponseThresholds: + return BadResponseThresholds( trustworthiness=trustworthiness, response_helpfulness=response_helpfulness, ) @@ -35,7 +35,7 @@ def scores(self) -> TrustworthyRAGScore: return make_scores(0.92, 0.75) @pytest.fixture - def custom_is_bad_response_config(self) -> IsBadResponseConfig: + def custom_is_bad_response_config(self) -> BadResponseThresholds: return make_is_bad_response_config(0.6, 0.7) def test_thresholds(self, scores: TrustworthyRAGScore) -> None: @@ -50,6 +50,6 @@ def test_thresholds(self, scores: TrustworthyRAGScore) -> None: is_bad_response_config = make_is_bad_response_config(0.5, 0.751) assert is_bad_response(scores, is_bad_response_config) - def test_scores(self, custom_is_bad_response_config: IsBadResponseConfig) -> None: + def test_scores(self, custom_is_bad_response_config: BadResponseThresholds) -> None: scores = make_scores(0.59, 0.7) assert is_bad_response(scores, custom_is_bad_response_config) From a2c0ea58265c0011985a10040cd89efaee5ece59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Fri, 21 Mar 2025 15:43:00 +0000 Subject: [PATCH 05/35] have is_bad_response function take the BadResponseThreshold object instead of a dict --- src/cleanlab_codex/validator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 08b8739..a4baa23 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -95,8 +95,7 @@ def detect(self, query: str, context: str, response: str) -> tuple[TrustworthyRA """ scores = cast(TrustworthyRAGScore, self._tlm_rag.score(response=response, query=query, context=context)) - thresholds_dict = self._bad_response_thresholds.model_dump() - is_bad_response = _is_bad_response(scores, thresholds_dict) + is_bad_response = _is_bad_response(scores, self._bad_response_thresholds) return scores, is_bad_response def remediate(self, query: str) -> str | None: From b8a1e97d7ae4e4fc92015108ebd18b48276c6fbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Sat, 22 Mar 2025 00:44:23 +0000 Subject: [PATCH 06/35] Enhance Validator with flexible thresholds and improved error handling Adds support for custom evaluation thresholds, introduces ThresholdedTrustworthyRAGScore type, and improves validation error handling with better documentation. 
--- src/cleanlab_codex/internal/validator.py | 86 +++++++++++++++++------- src/cleanlab_codex/types/validator.py | 35 ++++++++++ src/cleanlab_codex/validator.py | 63 +++++++++++------ tests/internal/test_validator.py | 3 - 4 files changed, 139 insertions(+), 48 deletions(-) create mode 100644 src/cleanlab_codex/types/validator.py diff --git a/src/cleanlab_codex/internal/validator.py b/src/cleanlab_codex/internal/validator.py index 09b91b9..1c5c54f 100644 --- a/src/cleanlab_codex/internal/validator.py +++ b/src/cleanlab_codex/internal/validator.py @@ -1,8 +1,8 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING, Any, cast -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator from cleanlab_codex.utils.errors import MissingDependencyError @@ -14,13 +14,12 @@ package_url="https://github.com/cleanlab/cleanlab-tlm", ) from e +if TYPE_CHECKING: + from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore -"""Evaluation metrics (excluding trustworthiness) that are used to determine if a response is bad.""" -EVAL_METRICS = ["response_helpfulness"] - -"""Evaluation metrics that are used to determine if a response is bad.""" -BAD_RESPONSE_EVAL_METRICS = ["trustworthiness", *EVAL_METRICS] +"""Evaluation metrics (excluding trustworthiness) that are used to determine if a response is bad.""" +DEFAULT_EVAL_METRICS = ["response_helpfulness"] class BadResponseThresholds(BaseModel): @@ -29,18 +28,53 @@ class BadResponseThresholds(BaseModel): """ trustworthiness: float = Field( - description="Threshold for trustworthiness. If the score is below this threshold, the response is bad.", + description="Threshold for trustworthiness.", default=0.5, - ge=0, - le=1, + ge=0.0, + le=1.0, ) response_helpfulness: float = Field( - description="Threshold for response helpfulness. If the score is below this threshold, the response is bad.", + description="Threshold for response helpfulness.", default=0.5, - ge=0, - le=1, + ge=0.0, + le=1.0, ) + @property + def default_threshold(self) -> float: + """The default threshold to use when a specific evaluation metric's threshold is not set. This threshold is set to 0.5.""" + return 0.5 + + def get_threshold(self, eval_name: str) -> float: + """Get threshold for an eval if it exists. + + For fields defined in the model, returns their value (which may be the field's default). + For custom evals not defined in the model, returns the default threshold value (see `default_threshold`). + """ + + # For fields defined in the model, use their value (which may be the field's default) + if eval_name in self.model_fields: + return cast(float, getattr(self, eval_name)) + + # For custom evals, use the default threshold + return getattr(self, eval_name, self.default_threshold) + + @field_validator("*") + @classmethod + def validate_threshold(cls, v: Any) -> float: + """Validate that all fields (including dynamic ones) are floats between 0 and 1.""" + if not isinstance(v, (int, float)): + error_msg = f"Threshold must be a number, got {type(v)}" + raise TypeError(error_msg) + if not 0 <= float(v) <= 1: + error_msg = f"Threshold must be between 0 and 1, got {v}" + raise ValueError(error_msg) + return float(v) + + model_config = { + "extra": "allow" # Allow additional fields for custom eval thresholds + } + def get_default_evaluations() -> list[Eval]: """Get the default evaluations for the TrustworthyRAG. 
@@ -48,26 +82,28 @@ def get_default_evaluations() -> list[Eval]: Note: This excludes trustworthiness, which is automatically computed by TrustworthyRAG. """ - return [evaluation for evaluation in get_default_evals() if evaluation.name in EVAL_METRICS] - - -DEFAULT_TRUSTWORTHYRAG_CONFIG = { - "options": { - "log": ["explanation"], - }, -} + return [evaluation for evaluation in get_default_evals() if evaluation.name in DEFAULT_EVAL_METRICS] def get_default_trustworthyrag_config() -> dict[str, Any]: """Get the default configuration for the TrustworthyRAG.""" - return DEFAULT_TRUSTWORTHYRAG_CONFIG + return { + "options": { + "log": ["explanation"], + }, + } -def is_bad_response(scores: TrustworthyRAGScore, thresholds: dict[str, float]) -> bool: +def is_bad_response( + scores: TrustworthyRAGScore | ThresholdedTrustworthyRAGScore, thresholds: BadResponseThresholds +) -> bool: """ Check if the response is bad based on the scores computed by TrustworthyRAG and the config containing thresholds. """ - for eval_metric, threshold in thresholds.items(): - if scores[eval_metric]["score"] < threshold: + for eval_metric, score_dict in scores.items(): + score = score_dict["score"] + if score is None: + continue + if score < thresholds.get_threshold(eval_metric): return True return False diff --git a/src/cleanlab_codex/types/validator.py b/src/cleanlab_codex/types/validator.py new file mode 100644 index 0000000..25b5a1d --- /dev/null +++ b/src/cleanlab_codex/types/validator.py @@ -0,0 +1,35 @@ +from cleanlab_tlm.utils.rag import EvalMetric + + +class ThresholdedEvalMetric(EvalMetric): + is_bad: bool + + +ThresholdedEvalMetric.__doc__ = f""" +{EvalMetric.__doc__} + +is_bad: bool + Whether the score is a certain threshold. +""" + + +class ThresholdedTrustworthyRAGScore(dict[str, ThresholdedEvalMetric]): + """Object returned by `Validator.detect` containing evaluation scores from [TrustworthyRAGScore](/tlm/api/python/utils.rag/#class-trustworthyragscore) + along with a boolean flag, `is_bad`, indicating whether the score is below the threshold. + + Example: + ```python + { + "trustworthiness": { + "score": 0.92, + "log": {"explanation": "Did not find a reason to doubt trustworthiness."}, + "is_bad": False + }, + "response_helpfulness": { + "score": 0.35, + "is_bad": True + }, + ... + } + ``` + """ \ No newline at end of file diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index a4baa23..76bb760 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -13,11 +13,11 @@ ) from cleanlab_codex.internal.validator import is_bad_response as _is_bad_response from cleanlab_codex.project import Project +from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore from cleanlab_codex.utils.errors import MissingDependencyError try: from cleanlab_tlm import TrustworthyRAG - from cleanlab_tlm.utils.rag import TrustworthyRAGScore except ImportError as e: raise MissingDependencyError( import_name=e.name or "cleanlab-tlm", @@ -40,32 +40,47 @@ def __init__( Args: codex_access_key (str): The [access key](/codex/web_tutorials/create_project/#access-keys) for a Codex project. tlm_api_key (Optional[str]): The API key for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag). - trustworthy_rag_config (Optional[dict[str, Any]]): The constructor arguments for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag). - bad_response_thresholds (Optional[dict[str, float]]): The thresholds for determining if a response is bad. 
+ trustworthy_rag_config (Optional[dict[str, Any]]): Optional initialization arguments for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag), which is used to detect response issues. + bad_response_thresholds (Optional[dict[str, float]]): Detection score thresholds used to flag whether or not a response is considered bad. Each key in this dict corresponds to an Eval from TrustworthyRAG, and the value indicates a threshold below which scores from this Eval are considered detected issues. A response is flagged as bad if any issues are detected for it. """ trustworthy_rag_config = trustworthy_rag_config or get_default_trustworthyrag_config() + if tlm_api_key is not None and "api_key" in trustworthy_rag_config: + error_msg = "Cannot specify both tlm_api_key and api_key in trustworthy_rag_config" + raise ValueError(error_msg) if tlm_api_key is not None: trustworthy_rag_config["api_key"] = tlm_api_key - self._bad_response_thresholds = BadResponseThresholds.model_validate(bad_response_thresholds or {}) self._project: Project = Project.from_access_key(access_key=codex_access_key) trustworthy_rag_config.setdefault("evals", get_default_evaluations()) self._tlm_rag = TrustworthyRAG(**trustworthy_rag_config) + # Validate that all the necessary thresholds are present in the TrustworthyRAG. + _evals = [e.name for e in self._tlm_rag.get_evals()] + ["trustworthiness"] + + self._bad_response_thresholds = BadResponseThresholds.model_validate(bad_response_thresholds or {}) + + _threshold_keys = self._bad_response_thresholds.model_dump().keys() + + # Check if there are any thresholds without corresponding evals (this is an error) + _extra_thresholds = set(_threshold_keys) - set(_evals) + if _extra_thresholds: + error_msg = f"Found thresholds for non-existent evaluation metrics: {_extra_thresholds}" + raise ValueError(error_msg) + def validate(self, query: str, context: str, response: str) -> dict[str, Any]: - """Validate the response quality and generate an alternative response if needed. + """Evaluate whether the AI-generated response is bad, and if so, request an alternate expert response. Args: - query (str): The user's original query. - context (str): The context provided to generate the response. - response (str): The response to evaluate. + query (str): The user query that was used to generate the response. + context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response. + response (str): A reponse from your LLM/RAG system. Returns: dict[str, Any]: A dictionary containing: - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. - - 'expert_answer': Alternate SME-provided answer from Codex, or None if no answer could be found in the Codex Project. - - Raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. + - 'expert_answer': Alternate SME-provided answer from Codex, or None if no answer could be found in the Codex Project. + - Additional keys: Various keys from a [`ThresholdedTrustworthyRAGScore`](/cleanlab_codex/types/validator/#class-thresholdedtrustworthyragscore) dictionary, with raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. `is_bad` indicating whether the score is below the threshold. 
""" scores, is_bad_response = self.detect(query, context, response) expert_answer = None @@ -78,28 +93,36 @@ def validate(self, query: str, context: str, response: str) -> dict[str, Any]: **scores, } - def detect(self, query: str, context: str, response: str) -> tuple[TrustworthyRAGScore, bool]: + def detect(self, query: str, context: str, response: str) -> tuple[ThresholdedTrustworthyRAGScore, bool]: """Evaluate the response quality using TrustworthyRAG and determine if it is a bad response. Args: - query (str): The user's original query. - context (str): The context provided to generate the response. - response (str): The response to evaluate. + query (str): The user query that was used to generate the response. + context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response. + response (str): A reponse from your LLM/RAG system. Returns: - tuple[TrustworthyRAGScore, bool]: A tuple containing: - - TrustworthyRAGScore: Quality scores for different evaluation metrics like trustworthiness - and response helpfulness. Each metric has a score between 0-1. + tuple[ThresholdedTrustworthyRAGScore, bool]: A tuple containing: + - ThresholdedTrustworthyRAGScore: Quality scores for different evaluation metrics like trustworthiness + and response helpfulness. Each metric has a score between 0-1. It also has a boolean flag, `is_bad` indicating whether the score is below a given threshold. - bool: True if the response is determined to be bad based on the evaluation scores and configured thresholds, False otherwise. """ - scores = cast(TrustworthyRAGScore, self._tlm_rag.score(response=response, query=query, context=context)) - + scores = cast( + ThresholdedTrustworthyRAGScore, self._tlm_rag.score(response=response, query=query, context=context) + ) + + # Enhance each score dictionary with its threshold check + for eval_name, score_dict in scores.items(): + score_dict.setdefault("is_bad", False) + if (score := score_dict["score"]) is not None: + score_dict["is_bad"] = score < self._bad_response_thresholds.get_threshold(eval_name) + is_bad_response = _is_bad_response(scores, self._bad_response_thresholds) return scores, is_bad_response def remediate(self, query: str) -> str | None: - """Request a SME-provided answer for this query, if one is available in Codex. + """Request a SME-provided answer for this query, if one is available in Codex. Args: query (str): The user's original query to get SME-provided answer for. 
diff --git a/tests/internal/test_validator.py b/tests/internal/test_validator.py index 847c31d..45b9719 100644 --- a/tests/internal/test_validator.py +++ b/tests/internal/test_validator.py @@ -39,9 +39,6 @@ def custom_is_bad_response_config(self) -> BadResponseThresholds: return make_is_bad_response_config(0.6, 0.7) def test_thresholds(self, scores: TrustworthyRAGScore) -> None: - default_is_bad_response = is_bad_response(scores) - assert not default_is_bad_response - # High trustworthiness_threshold is_bad_response_config = make_is_bad_response_config(0.921, 0.5) assert is_bad_response(scores, is_bad_response_config) From db5fe24e5d31c2d31ed130617b0ed5b3979c454b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Sat, 22 Mar 2025 00:49:57 +0000 Subject: [PATCH 07/35] move BadResponseThresholds --- src/cleanlab_codex/internal/validator.py | 62 ++---------------------- src/cleanlab_codex/types/validator.py | 2 +- src/cleanlab_codex/validator.py | 57 +++++++++++++++++++++- tests/internal/test_validator.py | 3 +- 4 files changed, 63 insertions(+), 61 deletions(-) diff --git a/src/cleanlab_codex/internal/validator.py b/src/cleanlab_codex/internal/validator.py index 1c5c54f..0526f9e 100644 --- a/src/cleanlab_codex/internal/validator.py +++ b/src/cleanlab_codex/internal/validator.py @@ -1,8 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, cast - -from pydantic import BaseModel, Field, field_validator +from typing import TYPE_CHECKING, Any from cleanlab_codex.utils.errors import MissingDependencyError @@ -16,66 +14,13 @@ if TYPE_CHECKING: from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore + from cleanlab_codex.validator import BadResponseThresholds """Evaluation metrics (excluding trustworthiness) that are used to determine if a response is bad.""" DEFAULT_EVAL_METRICS = ["response_helpfulness"] -class BadResponseThresholds(BaseModel): - """Config for determining if a response is bad. - Each key is an evaluation metric and the value is a threshold such that if the score is below the threshold, the response is bad. - """ - - trustworthiness: float = Field( - description="Threshold for trustworthiness.", - default=0.5, - ge=0.0, - le=1.0, - ) - response_helpfulness: float = Field( - description="Threshold for response helpfulness.", - default=0.5, - ge=0.0, - le=1.0, - ) - - @property - def default_threshold(self) -> float: - """The default threshold to use when a specific evaluation metric's threshold is not set. This threshold is set to 0.5.""" - return 0.5 - - def get_threshold(self, eval_name: str) -> float: - """Get threshold for an eval if it exists. - - For fields defined in the model, returns their value (which may be the field's default). - For custom evals not defined in the model, returns the default threshold value (see `default_threshold`). 
- """ - - # For fields defined in the model, use their value (which may be the field's default) - if eval_name in self.model_fields: - return cast(float, getattr(self, eval_name)) - - # For custom evals, use the default threshold - return getattr(self, eval_name, self.default_threshold) - - @field_validator("*") - @classmethod - def validate_threshold(cls, v: Any) -> float: - """Validate that all fields (including dynamic ones) are floats between 0 and 1.""" - if not isinstance(v, (int, float)): - error_msg = f"Threshold must be a number, got {type(v)}" - raise TypeError(error_msg) - if not 0 <= float(v) <= 1: - error_msg = f"Threshold must be between 0 and 1, got {v}" - raise ValueError(error_msg) - return float(v) - - model_config = { - "extra": "allow" # Allow additional fields for custom eval thresholds - } - - def get_default_evaluations() -> list[Eval]: """Get the default evaluations for the TrustworthyRAG. @@ -95,7 +40,8 @@ def get_default_trustworthyrag_config() -> dict[str, Any]: def is_bad_response( - scores: TrustworthyRAGScore | ThresholdedTrustworthyRAGScore, thresholds: BadResponseThresholds + scores: TrustworthyRAGScore | ThresholdedTrustworthyRAGScore, + thresholds: BadResponseThresholds, ) -> bool: """ Check if the response is bad based on the scores computed by TrustworthyRAG and the config containing thresholds. diff --git a/src/cleanlab_codex/types/validator.py b/src/cleanlab_codex/types/validator.py index 25b5a1d..930273f 100644 --- a/src/cleanlab_codex/types/validator.py +++ b/src/cleanlab_codex/types/validator.py @@ -32,4 +32,4 @@ class ThresholdedTrustworthyRAGScore(dict[str, ThresholdedEvalMetric]): ... } ``` - """ \ No newline at end of file + """ diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 76bb760..92251a3 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -6,8 +6,9 @@ from typing import Any, Optional, cast +from pydantic import BaseModel, Field, field_validator + from cleanlab_codex.internal.validator import ( - BadResponseThresholds, get_default_evaluations, get_default_trustworthyrag_config, ) @@ -25,6 +26,60 @@ ) from e +class BadResponseThresholds(BaseModel): + """Config for determining if a response is bad. + Each key is an evaluation metric and the value is a threshold such that if the score is below the threshold, the response is bad. + """ + + trustworthiness: float = Field( + description="Threshold for trustworthiness.", + default=0.5, + ge=0.0, + le=1.0, + ) + response_helpfulness: float = Field( + description="Threshold for response helpfulness.", + default=0.5, + ge=0.0, + le=1.0, + ) + + @property + def default_threshold(self) -> float: + """The default threshold to use when a specific evaluation metric's threshold is not set. This threshold is set to 0.5.""" + return 0.5 + + def get_threshold(self, eval_name: str) -> float: + """Get threshold for an eval if it exists. + + For fields defined in the model, returns their value (which may be the field's default). + For custom evals not defined in the model, returns the default threshold value (see `default_threshold`). 
+ """ + + # For fields defined in the model, use their value (which may be the field's default) + if eval_name in self.model_fields: + return cast(float, getattr(self, eval_name)) + + # For custom evals, use the default threshold + return getattr(self, eval_name, self.default_threshold) + + @field_validator("*") + @classmethod + def validate_threshold(cls, v: Any) -> float: + """Validate that all fields (including dynamic ones) are floats between 0 and 1.""" + if not isinstance(v, (int, float)): + error_msg = f"Threshold must be a number, got {type(v)}" + raise TypeError(error_msg) + if not 0 <= float(v) <= 1: + error_msg = f"Threshold must be between 0 and 1, got {v}" + raise ValueError(error_msg) + return float(v) + + model_config = { + "extra": "allow" # Allow additional fields for custom eval thresholds + } + + class Validator: def __init__( self, diff --git a/tests/internal/test_validator.py b/tests/internal/test_validator.py index 45b9719..a2f3146 100644 --- a/tests/internal/test_validator.py +++ b/tests/internal/test_validator.py @@ -3,7 +3,8 @@ import pytest from cleanlab_tlm.utils.rag import TrustworthyRAGScore -from cleanlab_codex.internal.validator import BadResponseThresholds, get_default_evaluations, is_bad_response +from cleanlab_codex.internal.validator import get_default_evaluations, is_bad_response +from cleanlab_codex.validator import BadResponseThresholds def make_scores(trustworthiness: float, response_helpfulness: float) -> TrustworthyRAGScore: From 29e231abb8b5d13b1cc3eb46fcb8177ed9913650 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Mon, 24 Mar 2025 11:32:53 -0700 Subject: [PATCH 08/35] add prompt and form_prompt --- src/cleanlab_codex/internal/validator.py | 9 ++++++ src/cleanlab_codex/validator.py | 38 +++++++++++++++++------- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/cleanlab_codex/internal/validator.py b/src/cleanlab_codex/internal/validator.py index 0526f9e..3cc55b9 100644 --- a/src/cleanlab_codex/internal/validator.py +++ b/src/cleanlab_codex/internal/validator.py @@ -39,6 +39,15 @@ def get_default_trustworthyrag_config() -> dict[str, Any]: } +def update_scores_based_on_thresholds(scores: dict[dict[str, Any]], thresholds: BadResponseThresholds) -> None: + """Adds a `""" + detection_flag = "is_bad" + for eval_name, score_dict in scores.items(): + score_dict.setdefault(detection_flag, False) + if (score := score_dict["score"]) is not None: + score_dict[detection_flag] = score < thresholds.get_threshold(eval_name) + + def is_bad_response( scores: TrustworthyRAGScore | ThresholdedTrustworthyRAGScore, thresholds: BadResponseThresholds, diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 92251a3..2929845 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Any, Optional, cast +from typing import Any, Callable, Optional, cast from pydantic import BaseModel, Field, field_validator @@ -13,6 +13,7 @@ get_default_trustworthyrag_config, ) from cleanlab_codex.internal.validator import is_bad_response as _is_bad_response +from cleanlab_codex.internal.validator import update_scores_based_on_thresholds as _update_scores_based_on_thresholds from cleanlab_codex.project import Project from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore from cleanlab_codex.utils.errors import MissingDependencyError @@ -123,7 +124,14 @@ def __init__( error_msg = f"Found thresholds for 
non-existent evaluation metrics: {_extra_thresholds}" raise ValueError(error_msg) - def validate(self, query: str, context: str, response: str) -> dict[str, Any]: + def validate( + self, + query: str, + context: str, + response: str, + prompt: Optional[str] = None, + form_prompt: Optional[Callable[[str, str], str]] = None, + ) -> dict[str, Any]: """Evaluate whether the AI-generated response is bad, and if so, request an alternate expert response. Args: @@ -137,7 +145,7 @@ def validate(self, query: str, context: str, response: str) -> dict[str, Any]: - 'expert_answer': Alternate SME-provided answer from Codex, or None if no answer could be found in the Codex Project. - Additional keys: Various keys from a [`ThresholdedTrustworthyRAGScore`](/cleanlab_codex/types/validator/#class-thresholdedtrustworthyragscore) dictionary, with raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. `is_bad` indicating whether the score is below the threshold. """ - scores, is_bad_response = self.detect(query, context, response) + scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt) expert_answer = None if is_bad_response: expert_answer = self.remediate(query) @@ -148,7 +156,14 @@ def validate(self, query: str, context: str, response: str) -> dict[str, Any]: **scores, } - def detect(self, query: str, context: str, response: str) -> tuple[ThresholdedTrustworthyRAGScore, bool]: + def detect( + self, + query: str, + context: str, + response: str, + prompt: Optional[str] = None, + form_prompt: Optional[Callable[[str, str], str]] = None, + ) -> tuple[ThresholdedTrustworthyRAGScore, bool]: """Evaluate the response quality using TrustworthyRAG and determine if it is a bad response. Args: @@ -164,14 +179,17 @@ def detect(self, query: str, context: str, response: str) -> tuple[ThresholdedTr and configured thresholds, False otherwise. 
""" scores = cast( - ThresholdedTrustworthyRAGScore, self._tlm_rag.score(response=response, query=query, context=context) + ThresholdedTrustworthyRAGScore, + self._tlm_rag.score( + response=response, + query=query, + context=context, + prompt=prompt, + form_prompt=form_prompt, + ), ) - # Enhance each score dictionary with its threshold check - for eval_name, score_dict in scores.items(): - score_dict.setdefault("is_bad", False) - if (score := score_dict["score"]) is not None: - score_dict["is_bad"] = score < self._bad_response_thresholds.get_threshold(eval_name) + _update_scores_based_on_thresholds(scores, thresholds=self._bad_response_thresholds) is_bad_response = _is_bad_response(scores, self._bad_response_thresholds) return scores, is_bad_response From a741e159a99eb6b77c149c16911e373d618525d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Mon, 24 Mar 2025 11:46:56 -0700 Subject: [PATCH 09/35] fix formatting and type hints --- src/cleanlab_codex/internal/validator.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/cleanlab_codex/internal/validator.py b/src/cleanlab_codex/internal/validator.py index 3cc55b9..67ceb12 100644 --- a/src/cleanlab_codex/internal/validator.py +++ b/src/cleanlab_codex/internal/validator.py @@ -39,13 +39,14 @@ def get_default_trustworthyrag_config() -> dict[str, Any]: } -def update_scores_based_on_thresholds(scores: dict[dict[str, Any]], thresholds: BadResponseThresholds) -> None: - """Adds a `""" - detection_flag = "is_bad" +def update_scores_based_on_thresholds( + scores: ThresholdedTrustworthyRAGScore, thresholds: BadResponseThresholds +) -> None: + """Adds a `is_bad` flag to the scores dictionary.""" for eval_name, score_dict in scores.items(): - score_dict.setdefault(detection_flag, False) + score_dict.setdefault("is_bad", False) if (score := score_dict["score"]) is not None: - score_dict[detection_flag] = score < thresholds.get_threshold(eval_name) + score_dict["is_bad"] = score < thresholds.get_threshold(eval_name) def is_bad_response( From 380b1efdadbe88f5032240fc15a5d18cce8d7f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Mon, 24 Mar 2025 11:55:49 -0700 Subject: [PATCH 10/35] update docstrings --- src/cleanlab_codex/internal/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleanlab_codex/internal/validator.py b/src/cleanlab_codex/internal/validator.py index 67ceb12..c9b97de 100644 --- a/src/cleanlab_codex/internal/validator.py +++ b/src/cleanlab_codex/internal/validator.py @@ -42,7 +42,7 @@ def get_default_trustworthyrag_config() -> dict[str, Any]: def update_scores_based_on_thresholds( scores: ThresholdedTrustworthyRAGScore, thresholds: BadResponseThresholds ) -> None: - """Adds a `is_bad` flag to the scores dictionary.""" + """Adds a `is_bad` flag to the scores dictionaries based on the thresholds.""" for eval_name, score_dict in scores.items(): score_dict.setdefault("is_bad", False) if (score := score_dict["score"]) is not None: From 4f40e3d89ef5f2fabf1b70562ea6a5d4d3fd0b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Mon, 24 Mar 2025 21:18:50 -0700 Subject: [PATCH 11/35] Add unit tests for Validator and BadResponseThresholds --- tests/test_validator.py | 172 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 tests/test_validator.py diff --git a/tests/test_validator.py b/tests/test_validator.py new file mode 100644 index 0000000..cc74972 --- /dev/null +++ 
b/tests/test_validator.py @@ -0,0 +1,172 @@ +from unittest.mock import Mock, patch + +from pydantic import ValidationError +import pytest + +from cleanlab_codex.validator import BadResponseThresholds, Validator + + +class TestBadResponseThresholds: + def test_get_threshold(self): + thresholds = BadResponseThresholds( + trustworthiness=0.5, + response_helpfulness=0.5, + ) + assert thresholds.get_threshold("trustworthiness") == 0.5 + assert thresholds.get_threshold("response_helpfulness") == 0.5 + + def test_default_threshold(self): + thresholds = BadResponseThresholds() + assert thresholds.get_threshold("trustworthiness") == 0.5 + assert thresholds.get_threshold("response_helpfulness") == 0.5 + + def test_unspecified_threshold(self): + thresholds = BadResponseThresholds() + assert thresholds.get_threshold("unspecified_threshold") == 0.5 + + def test_threshold_value(self): + thresholds = BadResponseThresholds(valid_threshold=0.3) + assert thresholds.get_threshold("valid_threshold") == 0.3 + assert thresholds.valid_threshold == 0.3 + + def test_invalid_threshold_value(self): + with pytest.raises(ValidationError): + BadResponseThresholds(trustworthiness=1.1) + + with pytest.raises(ValidationError): + BadResponseThresholds(response_helpfulness=-0.1) + + def test_invalid_threshold_type(self): + with pytest.raises(ValidationError): + BadResponseThresholds(trustworthiness="not a number") + + +@pytest.fixture +def mock_project(): + with patch("cleanlab_codex.validator.Project") as mock: + mock.from_access_key.return_value = Mock() + yield mock + + +@pytest.fixture +def mock_trustworthy_rag(): + mock = Mock() + mock.score.return_value = { + "trustworthiness": { + "score": 0.8, + "is_bad": False + }, + "response_helpfulness": { + "score": 0.7, + "is_bad": False + } + } + eval_mock = Mock() + eval_mock.name = "response_helpfulness" + mock.get_evals.return_value = [eval_mock] + with patch("cleanlab_codex.validator.TrustworthyRAG") as mock_class: + mock_class.return_value = mock + yield mock_class + + +class TestValidator: + def test_init(self, mock_project, mock_trustworthy_rag): + Validator(codex_access_key="test") + + # Verify Project was initialized with access key + mock_project.from_access_key.assert_called_once_with(access_key="test") + + # Verify TrustworthyRAG was initialized with default config + mock_trustworthy_rag.assert_called_once() + + def test_init_with_tlm_api_key(self, mock_project, mock_trustworthy_rag): + Validator(codex_access_key="test", tlm_api_key="tlm-key") + + # Verify TrustworthyRAG was initialized with API key + config = mock_trustworthy_rag.call_args[1] + assert config["api_key"] == "tlm-key" + + def test_init_with_config_conflict(self, mock_project, mock_trustworthy_rag): + with pytest.raises(ValueError, match="Cannot specify both tlm_api_key and api_key in trustworthy_rag_config"): + Validator( + codex_access_key="test", + tlm_api_key="tlm-key", + trustworthy_rag_config={"api_key": "config-key"} + ) + + def test_validate(self, mock_project, mock_trustworthy_rag): + validator = Validator(codex_access_key="test") + + result = validator.validate( + query="test query", + context="test context", + response="test response" + ) + + # Verify TrustworthyRAG.score was called + mock_trustworthy_rag.return_value.score.assert_called_once_with( + response="test response", + query="test query", + context="test context", + prompt=None, + form_prompt=None + ) + + # Verify expected result structure + assert result["is_bad_response"] is False + assert result["expert_answer"] is None + + 
eval_metrics = ["trustworthiness", "response_helpfulness"] + for metric in eval_metrics: + assert metric in result + assert "score" in result[metric] + assert "is_bad" in result[metric] + + def test_validate_expert_answer(self, mock_project, mock_trustworthy_rag): + # Setup mock project query response + mock_project.from_access_key.return_value.query.return_value = ("expert answer", None) + + # Basically any response will be flagged as untrustworthy + validator = Validator(codex_access_key="test", bad_response_thresholds={"trustworthiness": 1.0}) + result = validator.validate( + query="test query", + context="test context", + response="test response" + ) + assert result["expert_answer"] == "expert answer" + + mock_project.from_access_key.return_value.query.return_value = (None, None) + result = validator.validate( + query="test query", + context="test context", + response="test response" + ) + assert result["expert_answer"] is None + + + def test_detect(self, mock_project, mock_trustworthy_rag): + validator = Validator(codex_access_key="test") + + scores, is_bad = validator.detect( + query="test query", + context="test context", + response="test response" + ) + + # Verify scores match mock return value + assert scores["trustworthiness"]["score"] == 0.8 + assert scores["response_helpfulness"]["score"] == 0.7 + assert not is_bad # Since mock scores are above default thresholds + + def test_remediate(self, mock_project, mock_trustworthy_rag): + # Setup mock project query response + mock_project.from_access_key.return_value.query.return_value = ("expert answer", None) + + validator = Validator(codex_access_key="test") + result = validator.remediate("test query") + + # Verify project.query was called + mock_project.from_access_key.return_value.query.assert_called_once_with( + question="test query" + ) + assert result == "expert answer" \ No newline at end of file From 02b16e0ca1e1ad6020f9f211a8d0f462c9204308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Mon, 24 Mar 2025 21:24:43 -0700 Subject: [PATCH 12/35] include type hints and fix formatting --- tests/test_validator.py | 134 +++++++++++++++------------------------- 1 file changed, 51 insertions(+), 83 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index cc74972..abcff32 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -1,13 +1,14 @@ +from typing import Generator from unittest.mock import Mock, patch -from pydantic import ValidationError import pytest +from pydantic import ValidationError from cleanlab_codex.validator import BadResponseThresholds, Validator class TestBadResponseThresholds: - def test_get_threshold(self): + def test_get_threshold(self) -> None: thresholds = BadResponseThresholds( trustworthiness=0.5, response_helpfulness=0.5, @@ -15,51 +16,45 @@ def test_get_threshold(self): assert thresholds.get_threshold("trustworthiness") == 0.5 assert thresholds.get_threshold("response_helpfulness") == 0.5 - def test_default_threshold(self): + def test_default_threshold(self) -> None: thresholds = BadResponseThresholds() assert thresholds.get_threshold("trustworthiness") == 0.5 assert thresholds.get_threshold("response_helpfulness") == 0.5 - def test_unspecified_threshold(self): + def test_unspecified_threshold(self) -> None: thresholds = BadResponseThresholds() assert thresholds.get_threshold("unspecified_threshold") == 0.5 - - def test_threshold_value(self): - thresholds = BadResponseThresholds(valid_threshold=0.3) + + def test_threshold_value(self) -> None: + 
thresholds = BadResponseThresholds(valid_threshold=0.3) # type: ignore assert thresholds.get_threshold("valid_threshold") == 0.3 - assert thresholds.valid_threshold == 0.3 + assert thresholds.valid_threshold == 0.3 # type: ignore - def test_invalid_threshold_value(self): + def test_invalid_threshold_value(self) -> None: with pytest.raises(ValidationError): BadResponseThresholds(trustworthiness=1.1) - + with pytest.raises(ValidationError): BadResponseThresholds(response_helpfulness=-0.1) - def test_invalid_threshold_type(self): + def test_invalid_threshold_type(self) -> None: with pytest.raises(ValidationError): - BadResponseThresholds(trustworthiness="not a number") + BadResponseThresholds(trustworthiness="not a number") # type: ignore @pytest.fixture -def mock_project(): +def mock_project() -> Generator[Mock, None, None]: with patch("cleanlab_codex.validator.Project") as mock: mock.from_access_key.return_value = Mock() yield mock @pytest.fixture -def mock_trustworthy_rag(): +def mock_trustworthy_rag() -> Generator[Mock, None, None]: mock = Mock() mock.score.return_value = { - "trustworthiness": { - "score": 0.8, - "is_bad": False - }, - "response_helpfulness": { - "score": 0.7, - "is_bad": False - } + "trustworthiness": {"score": 0.8, "is_bad": False}, + "response_helpfulness": {"score": 0.7, "is_bad": False}, } eval_mock = Mock() eval_mock.name = "response_helpfulness" @@ -70,103 +65,76 @@ def mock_trustworthy_rag(): class TestValidator: - def test_init(self, mock_project, mock_trustworthy_rag): + def test_init(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None: Validator(codex_access_key="test") - + # Verify Project was initialized with access key mock_project.from_access_key.assert_called_once_with(access_key="test") - + # Verify TrustworthyRAG was initialized with default config mock_trustworthy_rag.assert_called_once() - - def test_init_with_tlm_api_key(self, mock_project, mock_trustworthy_rag): + + def test_init_with_tlm_api_key(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None: # noqa: ARG002 Validator(codex_access_key="test", tlm_api_key="tlm-key") - + # Verify TrustworthyRAG was initialized with API key config = mock_trustworthy_rag.call_args[1] assert config["api_key"] == "tlm-key" - - def test_init_with_config_conflict(self, mock_project, mock_trustworthy_rag): + + def test_init_with_config_conflict(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None: # noqa: ARG002 with pytest.raises(ValueError, match="Cannot specify both tlm_api_key and api_key in trustworthy_rag_config"): - Validator( - codex_access_key="test", - tlm_api_key="tlm-key", - trustworthy_rag_config={"api_key": "config-key"} - ) - - def test_validate(self, mock_project, mock_trustworthy_rag): + Validator(codex_access_key="test", tlm_api_key="tlm-key", trustworthy_rag_config={"api_key": "config-key"}) + + def test_validate(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None: # noqa: ARG002 validator = Validator(codex_access_key="test") - - result = validator.validate( - query="test query", - context="test context", - response="test response" - ) - + + result = validator.validate(query="test query", context="test context", response="test response") + # Verify TrustworthyRAG.score was called mock_trustworthy_rag.return_value.score.assert_called_once_with( - response="test response", - query="test query", - context="test context", - prompt=None, - form_prompt=None + response="test response", query="test query", context="test context", prompt=None, form_prompt=None ) - + # 
Verify expected result structure assert result["is_bad_response"] is False assert result["expert_answer"] is None - + eval_metrics = ["trustworthiness", "response_helpfulness"] for metric in eval_metrics: assert metric in result assert "score" in result[metric] assert "is_bad" in result[metric] - - def test_validate_expert_answer(self, mock_project, mock_trustworthy_rag): + + def test_validate_expert_answer(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None: # noqa: ARG002 # Setup mock project query response mock_project.from_access_key.return_value.query.return_value = ("expert answer", None) - + # Basically any response will be flagged as untrustworthy validator = Validator(codex_access_key="test", bad_response_thresholds={"trustworthiness": 1.0}) - result = validator.validate( - query="test query", - context="test context", - response="test response" - ) + result = validator.validate(query="test query", context="test context", response="test response") assert result["expert_answer"] == "expert answer" - + mock_project.from_access_key.return_value.query.return_value = (None, None) - result = validator.validate( - query="test query", - context="test context", - response="test response" - ) + result = validator.validate(query="test query", context="test context", response="test response") assert result["expert_answer"] is None - - - def test_detect(self, mock_project, mock_trustworthy_rag): + + def test_detect(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None: # noqa: ARG002 validator = Validator(codex_access_key="test") - - scores, is_bad = validator.detect( - query="test query", - context="test context", - response="test response" - ) - + + scores, is_bad = validator.detect(query="test query", context="test context", response="test response") + # Verify scores match mock return value assert scores["trustworthiness"]["score"] == 0.8 assert scores["response_helpfulness"]["score"] == 0.7 assert not is_bad # Since mock scores are above default thresholds - - def test_remediate(self, mock_project, mock_trustworthy_rag): + + def test_remediate(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None: # noqa: ARG002 # Setup mock project query response mock_project.from_access_key.return_value.query.return_value = ("expert answer", None) - + validator = Validator(codex_access_key="test") result = validator.remediate("test query") - + # Verify project.query was called - mock_project.from_access_key.return_value.query.assert_called_once_with( - question="test query" - ) - assert result == "expert answer" \ No newline at end of file + mock_project.from_access_key.return_value.query.assert_called_once_with(question="test query") + assert result == "expert answer" From 873f55218954da741a9150437c07941bf8c78c72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Tue, 25 Mar 2025 14:45:50 -0700 Subject: [PATCH 13/35] set "expert_answer" as first key --- src/cleanlab_codex/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 2929845..855fd6e 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -151,8 +151,8 @@ def validate( expert_answer = self.remediate(query) return { - "is_bad_response": is_bad_response, "expert_answer": expert_answer, + "is_bad_response": is_bad_response, **scores, } From b4713712299ae4f18767c805bb8fda28d605aea5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Tue, 25 Mar 2025 16:48:39 
-0700 Subject: [PATCH 14/35] clean up imports, type hints and docs --- src/cleanlab_codex/internal/validator.py | 48 +++++++++--------------- src/cleanlab_codex/validator.py | 45 ++++++++++------------ tests/internal/test_validator.py | 26 +------------ 3 files changed, 38 insertions(+), 81 deletions(-) diff --git a/src/cleanlab_codex/internal/validator.py b/src/cleanlab_codex/internal/validator.py index c9b97de..0914c02 100644 --- a/src/cleanlab_codex/internal/validator.py +++ b/src/cleanlab_codex/internal/validator.py @@ -1,19 +1,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional, Sequence, cast -from cleanlab_codex.utils.errors import MissingDependencyError +from cleanlab_tlm.utils.rag import Eval, TrustworthyRAGScore, get_default_evals -try: - from cleanlab_tlm.utils.rag import Eval, TrustworthyRAGScore, get_default_evals -except ImportError as e: - raise MissingDependencyError( - import_name=e.name or "cleanlab-tlm", - package_url="https://github.com/cleanlab/cleanlab-tlm", - ) from e +from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore if TYPE_CHECKING: - from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore from cleanlab_codex.validator import BadResponseThresholds @@ -40,26 +33,21 @@ def get_default_trustworthyrag_config() -> dict[str, Any]: def update_scores_based_on_thresholds( - scores: ThresholdedTrustworthyRAGScore, thresholds: BadResponseThresholds -) -> None: + scores: TrustworthyRAGScore | Sequence[TrustworthyRAGScore], thresholds: BadResponseThresholds +) -> ThresholdedTrustworthyRAGScore: """Adds a `is_bad` flag to the scores dictionaries based on the thresholds.""" - for eval_name, score_dict in scores.items(): - score_dict.setdefault("is_bad", False) - if (score := score_dict["score"]) is not None: - score_dict["is_bad"] = score < thresholds.get_threshold(eval_name) + # Helper function to check if a score is bad + def is_bad(score: Optional[float], threshold: float) -> bool: + return score is not None and score < threshold -def is_bad_response( - scores: TrustworthyRAGScore | ThresholdedTrustworthyRAGScore, - thresholds: BadResponseThresholds, -) -> bool: - """ - Check if the response is bad based on the scores computed by TrustworthyRAG and the config containing thresholds. 
- """ - for eval_metric, score_dict in scores.items(): - score = score_dict["score"] - if score is None: - continue - if score < thresholds.get_threshold(eval_metric): - return True - return False + if isinstance(scores, Sequence): + raise NotImplementedError("Batching is not supported yet.") + + thresholded_scores = {} + for eval_name, score_dict in scores.items(): + thresholded_scores[eval_name] = { + **score_dict, + "is_bad": is_bad(score_dict["score"], thresholds.get_threshold(eval_name)), + } + return cast(ThresholdedTrustworthyRAGScore, thresholded_scores) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 855fd6e..e4238a6 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -4,27 +4,20 @@ from __future__ import annotations -from typing import Any, Callable, Optional, cast +from typing import TYPE_CHECKING, Any, Callable, Optional, cast +from cleanlab_tlm import TrustworthyRAG from pydantic import BaseModel, Field, field_validator from cleanlab_codex.internal.validator import ( get_default_evaluations, get_default_trustworthyrag_config, ) -from cleanlab_codex.internal.validator import is_bad_response as _is_bad_response from cleanlab_codex.internal.validator import update_scores_based_on_thresholds as _update_scores_based_on_thresholds from cleanlab_codex.project import Project -from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore -from cleanlab_codex.utils.errors import MissingDependencyError -try: - from cleanlab_tlm import TrustworthyRAG -except ImportError as e: - raise MissingDependencyError( - import_name=e.name or "cleanlab-tlm", - package_url="https://github.com/cleanlab/cleanlab-tlm", - ) from e +if TYPE_CHECKING: + from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore class BadResponseThresholds(BaseModel): @@ -141,8 +134,8 @@ def validate( Returns: dict[str, Any]: A dictionary containing: - - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. - - 'expert_answer': Alternate SME-provided answer from Codex, or None if no answer could be found in the Codex Project. + - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found, or None otherwise. + - 'is_bad_response': True if the response is flagged as potentially bad (when True, a lookup in Codex is performed), False otherwise. - Additional keys: Various keys from a [`ThresholdedTrustworthyRAGScore`](/cleanlab_codex/types/validator/#class-thresholdedtrustworthyragscore) dictionary, with raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. `is_bad` indicating whether the score is below the threshold. """ scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt) @@ -164,7 +157,7 @@ def detect( prompt: Optional[str] = None, form_prompt: Optional[Callable[[str, str], str]] = None, ) -> tuple[ThresholdedTrustworthyRAGScore, bool]: - """Evaluate the response quality using TrustworthyRAG and determine if it is a bad response. + """Evaluate the response quality using TrustworthyRAG and determine if it is a bad response via thresholding. Args: query (str): The user query that was used to generate the response. @@ -178,21 +171,21 @@ def detect( - bool: True if the response is determined to be bad based on the evaluation scores and configured thresholds, False otherwise. 
""" - scores = cast( - ThresholdedTrustworthyRAGScore, - self._tlm_rag.score( - response=response, - query=query, - context=context, - prompt=prompt, - form_prompt=form_prompt, - ), + scores = self._tlm_rag.score( + response=response, + query=query, + context=context, + prompt=prompt, + form_prompt=form_prompt, ) - _update_scores_based_on_thresholds(scores, thresholds=self._bad_response_thresholds) + thresholded_scores = _update_scores_based_on_thresholds( + scores=scores, + thresholds=self._bad_response_thresholds, + ) - is_bad_response = _is_bad_response(scores, self._bad_response_thresholds) - return scores, is_bad_response + is_bad_response = any(score_dict["is_bad"] for score_dict in thresholded_scores.values()) + return thresholded_scores, is_bad_response def remediate(self, query: str) -> str | None: """Request a SME-provided answer for this query, if one is available in Codex. diff --git a/tests/internal/test_validator.py b/tests/internal/test_validator.py index a2f3146..b2d059e 100644 --- a/tests/internal/test_validator.py +++ b/tests/internal/test_validator.py @@ -1,9 +1,8 @@ from typing import cast -import pytest from cleanlab_tlm.utils.rag import TrustworthyRAGScore -from cleanlab_codex.internal.validator import get_default_evaluations, is_bad_response +from cleanlab_codex.internal.validator import get_default_evaluations from cleanlab_codex.validator import BadResponseThresholds @@ -28,26 +27,3 @@ def make_is_bad_response_config(trustworthiness: float, response_helpfulness: fl def test_get_default_evaluations() -> None: assert {evaluation.name for evaluation in get_default_evaluations()} == {"response_helpfulness"} - - -class TestIsBadResponse: - @pytest.fixture - def scores(self) -> TrustworthyRAGScore: - return make_scores(0.92, 0.75) - - @pytest.fixture - def custom_is_bad_response_config(self) -> BadResponseThresholds: - return make_is_bad_response_config(0.6, 0.7) - - def test_thresholds(self, scores: TrustworthyRAGScore) -> None: - # High trustworthiness_threshold - is_bad_response_config = make_is_bad_response_config(0.921, 0.5) - assert is_bad_response(scores, is_bad_response_config) - - # High response_helpfulness_threshold - is_bad_response_config = make_is_bad_response_config(0.5, 0.751) - assert is_bad_response(scores, is_bad_response_config) - - def test_scores(self, custom_is_bad_response_config: BadResponseThresholds) -> None: - scores = make_scores(0.59, 0.7) - assert is_bad_response(scores, custom_is_bad_response_config) From be4745ce489ae84e1ba184e2d5d9cc7a78d2f048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Tue, 25 Mar 2025 17:27:51 -0700 Subject: [PATCH 15/35] Update pyproject.toml Co-authored-by: Anish Athalye --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6b36862..ecba729 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "cleanlab-tlm>=1.0.12", + "cleanlab-tlm~=1.0.12", "codex-sdk==0.1.0a12", "pydantic>=2.0.0, <3", ] From 54e866b2aff3fef47c6869230f07449f1fb4b901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Tue, 25 Mar 2025 18:30:05 -0700 Subject: [PATCH 16/35] Update response_validation.py docstring to indicate module deprecation in favor of the new Validator API. 
--- src/cleanlab_codex/response_validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cleanlab_codex/response_validation.py b/src/cleanlab_codex/response_validation.py index e3bf78a..d46a5e0 100644 --- a/src/cleanlab_codex/response_validation.py +++ b/src/cleanlab_codex/response_validation.py @@ -1,4 +1,6 @@ """ +This module is now superseded by this [Validator API](/codex/api/validator/). + Validation functions for evaluating LLM responses and determining if they should be replaced with Codex-generated alternatives. """ From 0a2164907003ffaaf5bc8d897bf4762fe1e34df1 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 26 Mar 2025 11:28:38 -0700 Subject: [PATCH 17/35] add async query to improve latency --- src/cleanlab_codex/validator.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index e4238a6..2a44413 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -5,6 +5,7 @@ from __future__ import annotations from typing import TYPE_CHECKING, Any, Callable, Optional, cast +import asyncio from cleanlab_tlm import TrustworthyRAG from pydantic import BaseModel, Field, field_validator @@ -138,10 +139,14 @@ def validate( - 'is_bad_response': True if the response is flagged as potentially bad (when True, a lookup in Codex is performed), False otherwise. - Additional keys: Various keys from a [`ThresholdedTrustworthyRAGScore`](/cleanlab_codex/types/validator/#class-thresholdedtrustworthyragscore) dictionary, with raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. `is_bad` indicating whether the score is below the threshold. """ + expert_task = asyncio.create_task(self.remediate_async(query)) scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt) - expert_answer = None if is_bad_response: - expert_answer = self.remediate(query) + expert_answer, maybe_entry = asyncio.run(expert_task) + if expert_answer == None: + self._project.add_entries([maybe_entry]) + else: + expert_answer = None return { "expert_answer": expert_answer, @@ -198,3 +203,7 @@ def remediate(self, query: str) -> str | None: """ codex_answer, _ = self._project.query(question=query) return codex_answer + + async def remediate_async(self, query: str): + codex_answer, entry = self._project.query(question=query, read_only=True) + return codex_answer, entry From c63262540a2ff0843caaa9489137811ee8388aa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Wed, 26 Mar 2025 13:06:53 -0700 Subject: [PATCH 18/35] make remediate method private --- src/cleanlab_codex/validator.py | 4 ++-- tests/test_validator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index e4238a6..1f8fd93 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -141,7 +141,7 @@ def validate( scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt) expert_answer = None if is_bad_response: - expert_answer = self.remediate(query) + expert_answer = self._remediate(query) return { "expert_answer": expert_answer, @@ -187,7 +187,7 @@ def detect( is_bad_response = any(score_dict["is_bad"] for score_dict in thresholded_scores.values()) return thresholded_scores, is_bad_response - def remediate(self, query: str) -> str | None: + def _remediate(self, query: str) -> str | None: 
"""Request a SME-provided answer for this query, if one is available in Codex. Args: diff --git a/tests/test_validator.py b/tests/test_validator.py index abcff32..cdc2b21 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -133,7 +133,7 @@ def test_remediate(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None mock_project.from_access_key.return_value.query.return_value = ("expert answer", None) validator = Validator(codex_access_key="test") - result = validator.remediate("test query") + result = validator._remediate("test query") # Verify project.query was called mock_project.from_access_key.return_value.query.assert_called_once_with(question="test query") From d422bcf9f55448deb7f1999f3acf05265bd785e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Wed, 26 Mar 2025 13:11:14 -0700 Subject: [PATCH 19/35] update docstrings --- src/cleanlab_codex/validator.py | 50 ++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 1f8fd93..e9949a2 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -23,6 +23,11 @@ class BadResponseThresholds(BaseModel): """Config for determining if a response is bad. Each key is an evaluation metric and the value is a threshold such that if the score is below the threshold, the response is bad. + + Default Thresholds: + - trustworthiness: 0.5 + - response_helpfulness: 0.5 + - Any custom eval: 0.5 (if not explicitly specified in bad_response_thresholds) """ trustworthiness: float = Field( @@ -82,15 +87,41 @@ def __init__( trustworthy_rag_config: Optional[dict[str, Any]] = None, bad_response_thresholds: Optional[dict[str, float]] = None, ): - """Evaluates the quality of responses generated in RAG applications and remediates them if needed. + """Real-time detection and remediation of bad responses in RAG applications, powered by Cleanlab's TrustworthyRAG and Codex. - This object combines Cleanlab's various Evals with thresholding to detect bad responses and remediates them with Codex. + This object combines Cleanlab's TrustworthyRAG evaluation scores with configurable thresholds to detect potentially bad responses + in your RAG application. When a bad response is detected, it automatically attempts to remediate by retrieving an expert-provided + answer from your Codex project. + + For most use cases, we recommend using the `validate()` method which provides a complete validation workflow including + both detection and Codex remediation. The `detect()` method is available separately for testing and threshold tuning purposes + without triggering a Codex lookup. + + By default, this uses the same default configurations as [`TrustworthyRAG`](/tlm/api/python/utils.rag/#class-trustworthyrag), except: + - Explanations are returned in logs for better debugging + - Only the `response_helpfulness` eval is run Args: - codex_access_key (str): The [access key](/codex/web_tutorials/create_project/#access-keys) for a Codex project. - tlm_api_key (Optional[str]): The API key for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag). - trustworthy_rag_config (Optional[dict[str, Any]]): Optional initialization arguments for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag), which is used to detect response issues. - bad_response_thresholds (Optional[dict[str, float]]): Detection score thresholds used to flag whether or not a response is considered bad. 
Each key in this dict corresponds to an Eval from TrustworthyRAG, and the value indicates a threshold below which scores from this Eval are considered detected issues. A response is flagged as bad if any issues are detected for it. + codex_access_key (str): The [access key](/codex/web_tutorials/create_project/#access-keys) for a Codex project. Used to retrieve expert-provided answers + when bad responses are detected. + + tlm_api_key (str, optional): API key for accessing [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag). If not provided, this must be specified + in trustworthy_rag_config. + + trustworthy_rag_config (dict[str, Any], optional): Optional initialization arguments for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag), + which is used to detect response issues. If not provided, default configuration will be used. + + bad_response_thresholds (dict[str, float], optional): Detection score thresholds used to flag whether + a response is considered bad. Each key corresponds to an Eval from TrustworthyRAG, and the value + indicates a threshold (between 0 and 1) below which scores are considered detected issues. A response + is flagged as bad if any issues are detected. If not provided, default thresholds will be used. See + [`BadResponseThresholds`](/codex/api/python/validator/#class-badresponsethresholds) for more details. + + Raises: + ValueError: If both tlm_api_key and api_key in trustworthy_rag_config are provided. + ValueError: If bad_response_thresholds contains thresholds for non-existent evaluation metrics. + TypeError: If any threshold value is not a number. + ValueError: If any threshold value is not between 0 and 1. """ trustworthy_rag_config = trustworthy_rag_config or get_default_trustworthyrag_config() if tlm_api_key is not None and "api_key" in trustworthy_rag_config: @@ -157,7 +188,12 @@ def detect( prompt: Optional[str] = None, form_prompt: Optional[Callable[[str, str], str]] = None, ) -> tuple[ThresholdedTrustworthyRAGScore, bool]: - """Evaluate the response quality using TrustworthyRAG and determine if it is a bad response via thresholding. + """Score response quality using TrustworthyRAG and flag bad responses based on configured thresholds. + + Note: + This method is primarily intended for testing and threshold tuning purposes. For production use cases, + we recommend using the `validate()` method which provides a complete validation workflow including + Codex remediation. Args: query (str): The user query that was used to generate the response. 
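Taken together, the docstrings added in PATCH 19 describe the intended calling pattern for the new API. A usage sketch based on those docstrings follows; the access keys, thresholds, query, context, and response strings are placeholders, and the exact score keys returned depend on which evals are configured.

from cleanlab_codex.validator import Validator

validator = Validator(
    codex_access_key="<codex-project-access-key>",  # placeholder
    tlm_api_key="<tlm-api-key>",                    # placeholder
    bad_response_thresholds={"trustworthiness": 0.7, "response_helpfulness": 0.5},
)

result = validator.validate(
    query="How do I reset my password?",
    context="<context retrieved from the RAG knowledge base>",
    response="<draft answer produced by the LLM>",
)

if result["is_bad_response"]:
    # Prefer the SME-provided answer when Codex has one; otherwise the query is
    # logged in the Codex project for SMEs to answer and the draft can be withheld.
    final_answer = result["expert_answer"] or "<fallback message>"
else:
    final_answer = "<draft answer produced by the LLM>"

For threshold tuning without triggering a Codex lookup, detect() returns the thresholded scores and the is_bad_response flag directly, as described in the PATCH 19 docstring above.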
From d7bc592376c0a3db934cfbd7973dbdf77f1fe4b5 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 26 Mar 2025 13:35:37 -0700 Subject: [PATCH 20/35] revert and wait outside --- src/cleanlab_codex/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 2a44413..8888d5d 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -141,8 +141,8 @@ def validate( """ expert_task = asyncio.create_task(self.remediate_async(query)) scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt) + expert_answer, maybe_entry = asyncio.run(expert_task) if is_bad_response: - expert_answer, maybe_entry = asyncio.run(expert_task) if expert_answer == None: self._project.add_entries([maybe_entry]) else: From 2407b8807511076860496c63fa4240143a6b8d98 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 26 Mar 2025 13:42:19 -0700 Subject: [PATCH 21/35] add event lopping --- src/cleanlab_codex/validator.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 8888d5d..49df21b 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -139,12 +139,16 @@ def validate( - 'is_bad_response': True if the response is flagged as potentially bad (when True, a lookup in Codex is performed), False otherwise. - Additional keys: Various keys from a [`ThresholdedTrustworthyRAGScore`](/cleanlab_codex/types/validator/#class-thresholdedtrustworthyragscore) dictionary, with raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. `is_bad` indicating whether the score is below the threshold. 
""" - expert_task = asyncio.create_task(self.remediate_async(query)) + loop = asyncio.get_event_loop() + expert_task = loop.create_task(self.remediate_async(query)) scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt) - expert_answer, maybe_entry = asyncio.run(expert_task) + expert_answer, maybe_entry = loop.run_until_complete(expert_task) + if is_bad_response: if expert_answer == None: - self._project.add_entries([maybe_entry]) + self._project._sdk_client.projects.entries.add_question( + self._project._id, question=query, + ).model_dump() else: expert_answer = None From 0ac8e5dc202384f7b0da88abf6fb2612b702fdcb Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 26 Mar 2025 14:44:36 -0700 Subject: [PATCH 22/35] add thread correctly --- src/cleanlab_codex/validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 49df21b..6cd13b6 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -141,9 +141,9 @@ def validate( """ loop = asyncio.get_event_loop() expert_task = loop.create_task(self.remediate_async(query)) - scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt) + detect_task = loop.run_in_executor(None, self.detect, query, context, response, prompt, form_prompt) expert_answer, maybe_entry = loop.run_until_complete(expert_task) - + scores, is_bad_response = loop.run_until_complete(detect_task) if is_bad_response: if expert_answer == None: self._project._sdk_client.projects.entries.add_question( From 94c626a603e76b15dd2a8bed119d35538c4d8275 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 26 Mar 2025 14:52:11 -0700 Subject: [PATCH 23/35] add try catch --- src/cleanlab_codex/validator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 6cd13b6..5c9547d 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -139,11 +139,16 @@ def validate( - 'is_bad_response': True if the response is flagged as potentially bad (when True, a lookup in Codex is performed), False otherwise. - Additional keys: Various keys from a [`ThresholdedTrustworthyRAGScore`](/cleanlab_codex/types/validator/#class-thresholdedtrustworthyragscore) dictionary, with raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. `is_bad` indicating whether the score is below the threshold. 
""" - loop = asyncio.get_event_loop() + try: + loop = asyncio.get_running_loop() + except RuntimeError: # No running loop + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) expert_task = loop.create_task(self.remediate_async(query)) detect_task = loop.run_in_executor(None, self.detect, query, context, response, prompt, form_prompt) expert_answer, maybe_entry = loop.run_until_complete(expert_task) scores, is_bad_response = loop.run_until_complete(detect_task) + loop.close() if is_bad_response: if expert_answer == None: self._project._sdk_client.projects.entries.add_question( From 86707d92688ecda35db0a2ce68e670edc6e239a6 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 26 Mar 2025 17:26:08 -0700 Subject: [PATCH 24/35] Update validator.py --- src/cleanlab_codex/validator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index abf2ffd..21331c5 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -182,6 +182,7 @@ def validate( loop.close() if is_bad_response: if expert_answer == None: + # TODO: Make this async as well self._project._sdk_client.projects.entries.add_question( self._project._id, question=query, ).model_dump() From 0f1b8388af7d861c2dd2c03c5af962d11bcd3bba Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Tue, 1 Apr 2025 17:55:13 -0600 Subject: [PATCH 25/35] docstring --- src/cleanlab_codex/validator.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index feebc89..e4588a3 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -101,25 +101,19 @@ def validate( ) -> dict[str, Any]: """Evaluate whether the AI-generated response is bad, and if so, request an alternate expert answer. If no expert answer is available, this query is still logged for SMEs to answer. - prompt (str, optional): Optional prompt representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response. - form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt, provide one or the other. This function should take query and context as parameters and return a formatted prompt string. If not provided, a default prompt formatter will be used. To include a system prompt or any other special instructions for your LLM, incorporate them directly in your custom form_prompt() function definition. - - Returns: - dict[str, Any]: A dictionary containing: - - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found in the Codex Project, or None otherwise. - - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. When True, a Codex lookup is performed, which logs this query into the Codex Project for SMEs to answer. - - Additional keys from a [`ThresholdedTrustworthyRAGScore`](/codex/api/python/types.validator/#class-thresholdedtrustworthyragscore) dictionary: each corresponds to a [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) evaluation metric, and points to the score for this evaluation as well as a boolean `is_bad` flagging whether the score falls below the corresponding threshold. Args: query (str): The user query that was used to generate the response. 
context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response. response (str): A reponse from your LLM/RAG system. + prompt (str, optional): Optional prompt representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response. + form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt, provide one or the other. This function should take query and context as parameters and return a formatted prompt string. If not provided, a default prompt formatter will be used. To include a system prompt or any other special instructions for your LLM, incorporate them directly in your custom form_prompt() function definition. Returns: dict[str, Any]: A dictionary containing: - - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found, or None otherwise. - - 'is_bad_response': True if the response is flagged as potentially bad (when True, a lookup in Codex is performed), False otherwise. - - Additional keys: Various keys from a [`ThresholdedTrustworthyRAGScore`](/cleanlab_codex/types/validator/#class-thresholdedtrustworthyragscore) dictionary, with raw scores from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) for each evaluation metric. `is_bad` indicating whether the score is below the threshold. + - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found in the Codex Project, or None otherwise. + - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. When True, a Codex lookup is performed, which logs this query into the Codex Project for SMEs to answer. + - Additional keys from a [`ThresholdedTrustworthyRAGScore`](/codex/api/python/types.validator/#class-thresholdedtrustworthyragscore) dictionary: each corresponds to a [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) evaluation metric, and points to the score for this evaluation as well as a boolean `is_bad` flagging whether the score falls below the corresponding threshold. """ try: loop = asyncio.get_running_loop() From 25568339e4f296cbf67c64bed65624b0fe8ebdc0 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Tue, 1 Apr 2025 17:56:13 -0600 Subject: [PATCH 26/35] add tab to docstring --- src/cleanlab_codex/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index e4588a3..6f40cd8 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -111,7 +111,7 @@ def validate( Returns: dict[str, Any]: A dictionary containing: - - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found in the Codex Project, or None otherwise. + - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found in the Codex Project, or None otherwise. - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. When True, a Codex lookup is performed, which logs this query into the Codex Project for SMEs to answer. 
- Additional keys from a [`ThresholdedTrustworthyRAGScore`](/codex/api/python/types.validator/#class-thresholdedtrustworthyragscore) dictionary: each corresponds to a [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) evaluation metric, and points to the score for this evaluation as well as a boolean `is_bad` flagging whether the score falls below the corresponding threshold. """ From cee4f137e5674ea1230e2a46f70d73eec3aaecd5 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Tue, 1 Apr 2025 18:01:42 -0600 Subject: [PATCH 27/35] add bool run_async --- src/cleanlab_codex/validator.py | 44 +++++++++++++++++---------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 6f40cd8..8c9c052 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -98,6 +98,7 @@ def validate( response: str, prompt: Optional[str] = None, form_prompt: Optional[Callable[[str, str], str]] = None, + run_async: bool = False, ) -> dict[str, Any]: """Evaluate whether the AI-generated response is bad, and if so, request an alternate expert answer. If no expert answer is available, this query is still logged for SMEs to answer. @@ -115,29 +116,30 @@ def validate( - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. When True, a Codex lookup is performed, which logs this query into the Codex Project for SMEs to answer. - Additional keys from a [`ThresholdedTrustworthyRAGScore`](/codex/api/python/types.validator/#class-thresholdedtrustworthyragscore) dictionary: each corresponds to a [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) evaluation metric, and points to the score for this evaluation as well as a boolean `is_bad` flagging whether the score falls below the corresponding threshold. 
""" - try: - loop = asyncio.get_running_loop() - except RuntimeError: # No running loop - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - expert_task = loop.create_task(self.remediate_async(query)) - detect_task = loop.run_in_executor(None, self.detect, query, context, response, prompt, form_prompt) - expert_answer, maybe_entry = loop.run_until_complete(expert_task) - scores, is_bad_response = loop.run_until_complete(detect_task) - loop.close() - if is_bad_response: - if expert_answer == None: - # TODO: Make this async as well - self._project._sdk_client.projects.entries.add_question( - self._project._id, question=query, - ).model_dump() + if run_async: + try: + loop = asyncio.get_running_loop() + except RuntimeError: # No running loop + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + expert_task = loop.create_task(self.remediate_async(query)) + detect_task = loop.run_in_executor(None, self.detect, query, context, response, prompt, form_prompt) + expert_answer, maybe_entry = loop.run_until_complete(expert_task) + scores, is_bad_response = loop.run_until_complete(detect_task) + loop.close() + if is_bad_response: + if expert_answer == None: + # TODO: Make this async as well + self._project._sdk_client.projects.entries.add_question( + self._project._id, question=query, + ).model_dump() + else: + expert_answer = None else: + scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt) expert_answer = None - - scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt) - expert_answer = None - if is_bad_response: - expert_answer = self._remediate(query) + if is_bad_response: + expert_answer = self._remediate(query) return { "expert_answer": expert_answer, From 84cc0f7a3491f741b5c45d29b95937a629637163 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 2 Apr 2025 09:29:01 -0600 Subject: [PATCH 28/35] linting --- src/cleanlab_codex/validator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 8c9c052..de24d9e 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Callable, Optional, cast +from typing import TYPE_CHECKING, Any, Callable, Optional, cast, Tuple import asyncio @@ -18,6 +18,8 @@ from cleanlab_codex.internal.validator import update_scores_based_on_thresholds as _update_scores_based_on_thresholds from cleanlab_codex.project import Project +from src.cleanlab_codex.types.entry import Entry + if TYPE_CHECKING: from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore @@ -128,7 +130,7 @@ def validate( scores, is_bad_response = loop.run_until_complete(detect_task) loop.close() if is_bad_response: - if expert_answer == None: + if expert_answer is None: # TODO: Make this async as well self._project._sdk_client.projects.entries.add_question( self._project._id, question=query, @@ -204,7 +206,7 @@ def _remediate(self, query: str) -> str | None: codex_answer, _ = self._project.query(question=query) return codex_answer - async def remediate_async(self, query: str): + async def remediate_async(self, query: str) -> Tuple[Optional[str], Optional[Entry]]: codex_answer, entry = self._project.query(question=query, read_only=True) return codex_answer, entry From 640a194351ced42ad457818b26908178100bad61 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 2 Apr 2025 09:37:04 -0600 
Subject: [PATCH 29/35] typing --- src/cleanlab_codex/validator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index de24d9e..cd7ad6f 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -4,9 +4,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Callable, Optional, cast, Tuple import asyncio - +from typing import TYPE_CHECKING, Any, Callable, Optional, Tuple, cast from cleanlab_tlm import TrustworthyRAG from pydantic import BaseModel, Field, field_validator @@ -18,10 +17,9 @@ from cleanlab_codex.internal.validator import update_scores_based_on_thresholds as _update_scores_based_on_thresholds from cleanlab_codex.project import Project -from src.cleanlab_codex.types.entry import Entry - if TYPE_CHECKING: from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore + from src.cleanlab_codex.types.entry import Entry class Validator: From 158e1b22323e79e37ba1a710074d8b1ef11a71f7 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 2 Apr 2025 09:39:17 -0600 Subject: [PATCH 30/35] entry fix --- src/cleanlab_codex/validator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index cd7ad6f..0632f85 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -19,7 +19,8 @@ if TYPE_CHECKING: from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore - from src.cleanlab_codex.types.entry import Entry + from cleanlab_codex.types.entry import Entry + class Validator: From c4330fd52256bdea2756127a71928a97e03d45d5 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 2 Apr 2025 09:45:12 -0600 Subject: [PATCH 31/35] format fix --- src/cleanlab_codex/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 0632f85..185d81c 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -18,8 +18,8 @@ from cleanlab_codex.project import Project if TYPE_CHECKING: - from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore from cleanlab_codex.types.entry import Entry + from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore From c9e1357edd464d63bf56c069ca80e428e902a0e4 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 2 Apr 2025 09:51:19 -0600 Subject: [PATCH 32/35] add docstring --- src/cleanlab_codex/validator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 185d81c..8bef1a8 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -97,9 +97,9 @@ def validate( query: str, context: str, response: str, + run_async: bool = False, prompt: Optional[str] = None, form_prompt: Optional[Callable[[str, str], str]] = None, - run_async: bool = False, ) -> dict[str, Any]: """Evaluate whether the AI-generated response is bad, and if so, request an alternate expert answer. If no expert answer is available, this query is still logged for SMEs to answer. @@ -108,6 +108,7 @@ def validate( query (str): The user query that was used to generate the response. context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response. response (str): A reponse from your LLM/RAG system. 
+ run_async (bool): If True, runs detect asynchronously prompt (str, optional): Optional prompt representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response. form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt, provide one or the other. This function should take query and context as parameters and return a formatted prompt string. If not provided, a default prompt formatter will be used. To include a system prompt or any other special instructions for your LLM, incorporate them directly in your custom form_prompt() function definition. From 63d2614bb27caecb38afa8bffc5af8b2dd749342 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 2 Apr 2025 09:57:13 -0600 Subject: [PATCH 33/35] simpler cod --- src/cleanlab_codex/validator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 8bef1a8..2cd1d8a 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -97,6 +97,7 @@ def validate( query: str, context: str, response: str, + *, run_async: bool = False, prompt: Optional[str] = None, form_prompt: Optional[Callable[[str, str], str]] = None, @@ -132,8 +133,10 @@ def validate( if is_bad_response: if expert_answer is None: # TODO: Make this async as well - self._project._sdk_client.projects.entries.add_question( - self._project._id, question=query, + project_id = self._project._id + question_entry = self._project._sdk_client.projects.entries.add_question( + project_id, + question=query, ).model_dump() else: expert_answer = None From bc45c23f9eaf547be635c9c6cd619bf527e90711 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 2 Apr 2025 10:12:31 -0600 Subject: [PATCH 34/35] noqa --- src/cleanlab_codex/validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 2cd1d8a..0a562de 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -133,8 +133,8 @@ def validate( if is_bad_response: if expert_answer is None: # TODO: Make this async as well - project_id = self._project._id - question_entry = self._project._sdk_client.projects.entries.add_question( + project_id = self._project._id # noqa: SLF001 + self._project._sdk_client.projects.entries.add_question( # noqa: SLF001 project_id, question=query, ).model_dump() From acb3beb63b37efa780cbbc3b3ddaa7f923ae08c2 Mon Sep 17 00:00:00 2001 From: Aditya Thyagarajan Date: Wed, 2 Apr 2025 10:32:35 -0600 Subject: [PATCH 35/35] linting --- src/cleanlab_codex/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py index 0a562de..8921d8e 100644 --- a/src/cleanlab_codex/validator.py +++ b/src/cleanlab_codex/validator.py @@ -22,7 +22,6 @@ from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore - class Validator: def __init__( self, @@ -213,6 +212,7 @@ async def remediate_async(self, query: str) -> Tuple[Optional[str], Optional[Ent codex_answer, entry = self._project.query(question=query, read_only=True) return codex_answer, entry + class BadResponseThresholds(BaseModel): """Config for determining if a response is bad. 
Each key is an evaluation metric and the value is a threshold such that a response is considered bad whenever the corresponding evaluation score falls below the threshold.
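The run_async changes in PATCHES 17 and 20-27 above aim to overlap TrustworthyRAG scoring with the Codex lookup instead of running them back to back. The sketch below shows that overlap with stand-in worker functions; it deliberately uses asyncio.to_thread and asyncio.gather rather than the manual event-loop handling in the patches, so treat it as an illustration of the intended concurrency, not the package's implementation.

import asyncio
import time
from typing import Optional


def score_response(query: str, context: str, response: str) -> bool:
    """Stand-in for detect(): returns True when the response should be flagged as bad."""
    time.sleep(0.2)  # simulate a blocking scoring call
    return True


def lookup_expert_answer(query: str) -> Optional[str]:
    """Stand-in for the read-only Codex project query."""
    time.sleep(0.2)  # simulate a blocking network call
    return "expert answer"


async def validate_concurrently(query: str, context: str, response: str) -> Optional[str]:
    # Run both blocking calls in worker threads so they overlap rather than run serially.
    is_bad, expert_answer = await asyncio.gather(
        asyncio.to_thread(score_response, query, context, response),
        asyncio.to_thread(lookup_expert_answer, query),
    )
    # Only surface the expert answer when the response is flagged as bad; otherwise the
    # concurrent lookup result is discarded, mirroring the behavior in the patches above.
    return expert_answer if is_bad else None


print(asyncio.run(validate_concurrently("example query", "example context", "draft response")))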