diff --git a/src/strands_evals/evaluators/faithfulness_evaluator.py b/src/strands_evals/evaluators/faithfulness_evaluator.py index bf79b23..8a5c9a2 100644 --- a/src/strands_evals/evaluators/faithfulness_evaluator.py +++ b/src/strands_evals/evaluators/faithfulness_evaluator.py @@ -1,4 +1,5 @@ from enum import Enum +from typing import cast from pydantic import BaseModel, Field from strands import Agent @@ -59,29 +60,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva parsed_input = self._get_last_turn(evaluation_case) prompt = self._format_prompt(parsed_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = evaluator_agent.structured_output(FaithfulnessRating, prompt) + result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating) + rating = cast(FaithfulnessRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, - test_pass=normalized_score >= 0.5, - reason=rating.reasoning, - label=rating.score, - ) - return [result] + return [ + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score >= 0.5, + reason=rating.reasoning, + label=rating.score, + ) + ] async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: parsed_input = self._get_last_turn(evaluation_case) prompt = self._format_prompt(parsed_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = await evaluator_agent.structured_output_async(FaithfulnessRating, prompt) + result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating) + rating = cast(FaithfulnessRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, - test_pass=normalized_score >= 0.5, - reason=rating.reasoning, - label=rating.score, - ) - return [result] + return [ + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score >= 0.5, + reason=rating.reasoning, + label=rating.score, + ) + ] def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput: """Extract the most recent turn from the conversation for evaluation.""" diff --git a/src/strands_evals/evaluators/goal_success_rate_evaluator.py b/src/strands_evals/evaluators/goal_success_rate_evaluator.py index 93a3bf5..85896bb 100644 --- a/src/strands_evals/evaluators/goal_success_rate_evaluator.py +++ b/src/strands_evals/evaluators/goal_success_rate_evaluator.py @@ -1,4 +1,5 @@ from enum import Enum +from typing import cast from pydantic import BaseModel, Field from strands import Agent @@ -53,29 +54,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva session_input = self._parse_trajectory(evaluation_case) prompt = self._format_prompt(session_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = evaluator_agent.structured_output(GoalSuccessRating, prompt) + result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating) + rating = cast(GoalSuccessRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, - test_pass=normalized_score >= 1.0, - reason=rating.reasoning, - label=rating.score, - ) - return [result] + return [ + EvaluationOutput( + 
score=normalized_score, + test_pass=normalized_score >= 1.0, + reason=rating.reasoning, + label=rating.score, + ) + ] async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: session_input = self._parse_trajectory(evaluation_case) prompt = self._format_prompt(session_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt) + result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating) + rating = cast(GoalSuccessRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, - test_pass=normalized_score >= 1.0, - reason=rating.reasoning, - label=rating.score, - ) - return [result] + return [ + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score >= 1.0, + reason=rating.reasoning, + label=rating.score, + ) + ] def _format_prompt(self, session_input: SessionLevelInput) -> str: """Format evaluation prompt from session-level input.""" diff --git a/src/strands_evals/evaluators/harmfulness_evaluator.py b/src/strands_evals/evaluators/harmfulness_evaluator.py index 07c7281..3719f2a 100644 --- a/src/strands_evals/evaluators/harmfulness_evaluator.py +++ b/src/strands_evals/evaluators/harmfulness_evaluator.py @@ -1,4 +1,5 @@ from enum import Enum +from typing import cast from pydantic import BaseModel, Field from strands import Agent @@ -52,29 +53,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva parsed_input = self._get_last_turn(evaluation_case) prompt = self._format_prompt(parsed_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = evaluator_agent.structured_output(HarmfulnessRating, prompt) + result = evaluator_agent(prompt, structured_output_model=HarmfulnessRating) + rating = cast(HarmfulnessRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, - test_pass=normalized_score == 1.0, - reason=rating.reasoning, - label=rating.score, - ) - return [result] + return [ + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score == 1.0, + reason=rating.reasoning, + label=rating.score, + ) + ] async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: parsed_input = self._get_last_turn(evaluation_case) prompt = self._format_prompt(parsed_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = await evaluator_agent.structured_output_async(HarmfulnessRating, prompt) + result = await evaluator_agent.invoke_async(prompt, structured_output_model=HarmfulnessRating) + rating = cast(HarmfulnessRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, - test_pass=normalized_score == 1.0, - reason=rating.reasoning, - label=rating.score, - ) - return [result] + return [ + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score == 1.0, + reason=rating.reasoning, + label=rating.score, + ) + ] def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput: """Extract the most recent turn from the conversation for evaluation.""" diff --git 
a/src/strands_evals/evaluators/helpfulness_evaluator.py b/src/strands_evals/evaluators/helpfulness_evaluator.py index c2e893a..2fb487b 100644 --- a/src/strands_evals/evaluators/helpfulness_evaluator.py +++ b/src/strands_evals/evaluators/helpfulness_evaluator.py @@ -1,4 +1,5 @@ from enum import Enum +from typing import cast from pydantic import BaseModel, Field from strands import Agent @@ -65,29 +66,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva parsed_input = self._get_last_turn(evaluation_case) prompt = self._format_prompt(parsed_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = evaluator_agent.structured_output(HelpfulnessRating, prompt) + result = evaluator_agent(prompt, structured_output_model=HelpfulnessRating) + rating = cast(HelpfulnessRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, - test_pass=normalized_score >= 0.5, - reason=rating.reasoning, - label=rating.score, - ) - return [result] + return [ + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score >= 0.5, + reason=rating.reasoning, + label=rating.score, + ) + ] async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: parsed_input = self._get_last_turn(evaluation_case) prompt = self._format_prompt(parsed_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = await evaluator_agent.structured_output_async(HelpfulnessRating, prompt) + result = await evaluator_agent.invoke_async(prompt, structured_output_model=HelpfulnessRating) + rating = cast(HelpfulnessRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, - test_pass=normalized_score >= 0.5, - reason=rating.reasoning, - label=rating.score, - ) - return [result] + return [ + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score >= 0.5, + reason=rating.reasoning, + label=rating.score, + ) + ] def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput: """Extract the most recent turn from the conversation for evaluation.""" diff --git a/src/strands_evals/evaluators/interactions_evaluator.py b/src/strands_evals/evaluators/interactions_evaluator.py index ba2b59a..76f83f1 100644 --- a/src/strands_evals/evaluators/interactions_evaluator.py +++ b/src/strands_evals/evaluators/interactions_evaluator.py @@ -1,3 +1,5 @@ +from typing import cast + from strands import Agent from strands.agent.conversation_manager import SlidingWindowConversationManager from strands.models.model import Model @@ -198,8 +200,8 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva for i in range(num_interactions): is_last = i == num_interactions - 1 evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last) - result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt) - results.append(result) + result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput) + results.append(cast(EvaluationOutput, result.structured_output)) return results @@ -238,7 +240,7 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) for i in range(num_interactions): is_last = i == num_interactions - 1 evaluation_prompt = self._compose_prompt(evaluation_case, 
i, is_last) - result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt) - results.append(result) + result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput) + results.append(cast(EvaluationOutput, result.structured_output)) return results diff --git a/src/strands_evals/evaluators/output_evaluator.py b/src/strands_evals/evaluators/output_evaluator.py index 34a60ca..1da9341 100644 --- a/src/strands_evals/evaluators/output_evaluator.py +++ b/src/strands_evals/evaluators/output_evaluator.py @@ -1,3 +1,5 @@ +from typing import cast + from strands import Agent from strands.models.model import Model from typing_extensions import TypeVar, Union @@ -51,8 +53,8 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva evaluation_prompt = compose_test_prompt( evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs ) - result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt) - return [result] + result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput) + return [cast(EvaluationOutput, result.structured_output)] async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: """ @@ -68,5 +70,5 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) evaluation_prompt = compose_test_prompt( evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs ) - result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt) - return [result] + result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput) + return [cast(EvaluationOutput, result.structured_output)] diff --git a/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py b/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py index 7a1b938..63e8ac8 100644 --- a/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +++ b/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py @@ -1,4 +1,5 @@ from enum import Enum +from typing import cast from pydantic import BaseModel, Field from strands import Agent @@ -56,12 +57,17 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva for tool_input in tool_inputs: prompt = self._format_prompt(tool_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = evaluator_agent.structured_output(ToolParameterAccuracyRating, prompt) + result = evaluator_agent(prompt, structured_output_model=ToolParameterAccuracyRating) + rating = cast(ToolParameterAccuracyRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score + results.append( + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score == 1.0, + reason=rating.reasoning, + label=rating.score, + ) ) - results.append(result) return results @@ -72,12 +78,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) for tool_input in tool_inputs: prompt = self._format_prompt(tool_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = await evaluator_agent.structured_output_async(ToolParameterAccuracyRating, prompt) + result = await 
evaluator_agent.invoke_async(prompt, structured_output_model=ToolParameterAccuracyRating) + rating = cast(ToolParameterAccuracyRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score + results.append( + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score == 1.0, + reason=rating.reasoning, + label=rating.score, + ) ) - results.append(result) return results diff --git a/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py b/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py index 95807a8..3904562 100644 --- a/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +++ b/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py @@ -1,4 +1,5 @@ from enum import Enum +from typing import cast from pydantic import BaseModel, Field from strands import Agent @@ -56,12 +57,17 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva for tool_input in tool_inputs: prompt = self._format_prompt(tool_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = evaluator_agent.structured_output(ToolSelectionRating, prompt) + result = evaluator_agent(prompt, structured_output_model=ToolSelectionRating) + rating = cast(ToolSelectionRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score + results.append( + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score == 1.0, + reason=rating.reasoning, + label=rating.score, + ) ) - results.append(result) return results @@ -72,12 +78,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) for tool_input in tool_inputs: prompt = self._format_prompt(tool_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) - rating = await evaluator_agent.structured_output_async(ToolSelectionRating, prompt) + result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolSelectionRating) + rating = cast(ToolSelectionRating, result.structured_output) normalized_score = self._score_mapping[rating.score] - result = EvaluationOutput( - score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score + results.append( + EvaluationOutput( + score=normalized_score, + test_pass=normalized_score == 1.0, + reason=rating.reasoning, + label=rating.score, + ) ) - results.append(result) return results diff --git a/src/strands_evals/evaluators/trajectory_evaluator.py b/src/strands_evals/evaluators/trajectory_evaluator.py index 76d47f7..049c85b 100644 --- a/src/strands_evals/evaluators/trajectory_evaluator.py +++ b/src/strands_evals/evaluators/trajectory_evaluator.py @@ -1,3 +1,5 @@ +from typing import cast + from strands import Agent from strands.models.model import Model from typing_extensions import Any, TypeVar, Union @@ -74,8 +76,8 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva include_inputs=self.include_inputs, uses_trajectory=True, ) - result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt) - return [result] + result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput) + return 
[cast(EvaluationOutput, result.structured_output)] async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: """ @@ -96,5 +98,5 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) include_inputs=self.include_inputs, uses_trajectory=True, ) - result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt) - return [result] + result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput) + return [cast(EvaluationOutput, result.structured_output)] diff --git a/tests/strands_evals/evaluators/test_faithfulness_evaluator.py b/tests/strands_evals/evaluators/test_faithfulness_evaluator.py index 85465ad..90b656a 100644 --- a/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +++ b/tests/strands_evals/evaluators/test_faithfulness_evaluator.py @@ -50,9 +50,11 @@ def test_init_with_custom_values(): @patch("strands_evals.evaluators.faithfulness_evaluator.Agent") def test_evaluate(mock_agent_class, evaluation_data): mock_agent = Mock() - mock_agent.structured_output.return_value = FaithfulnessRating( + mock_result = Mock() + mock_result.structured_output = FaithfulnessRating( reasoning="The response is faithful", score=FaithfulnessScore.COMPLETELY_YES ) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = FaithfulnessEvaluator() @@ -78,7 +80,9 @@ def test_evaluate(mock_agent_class, evaluation_data): @patch("strands_evals.evaluators.faithfulness_evaluator.Agent") def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass): mock_agent = Mock() - mock_agent.structured_output.return_value = FaithfulnessRating(reasoning="Test", score=score) + mock_result = Mock() + mock_result.structured_output = FaithfulnessRating(reasoning="Test", score=score) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = FaithfulnessEvaluator() @@ -95,10 +99,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, async def test_evaluate_async(mock_agent_class, evaluation_data): mock_agent = Mock() - async def mock_structured_output_async(*args, **kwargs): - return FaithfulnessRating(reasoning="The response is faithful", score=FaithfulnessScore.COMPLETELY_YES) + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = FaithfulnessRating( + reasoning="The response is faithful", score=FaithfulnessScore.COMPLETELY_YES + ) + return mock_result - mock_agent.structured_output_async = mock_structured_output_async + mock_agent.invoke_async = mock_invoke_async mock_agent_class.return_value = mock_agent evaluator = FaithfulnessEvaluator() diff --git a/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py b/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py index 53c0829..86dce93 100644 --- a/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +++ b/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py @@ -67,9 +67,9 @@ def test_init_with_custom_values(): @patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent") def test_evaluate(mock_agent_class, evaluation_data): mock_agent = Mock() - mock_agent.structured_output.return_value = GoalSuccessRating( - reasoning="All goals achieved", score=GoalSuccessScore.YES - ) + mock_result = Mock() + mock_result.structured_output = GoalSuccessRating(reasoning="All goals achieved", 
score=GoalSuccessScore.YES) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = GoalSuccessRateEvaluator() @@ -92,7 +92,9 @@ def test_evaluate(mock_agent_class, evaluation_data): @patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent") def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass): mock_agent = Mock() - mock_agent.structured_output.return_value = GoalSuccessRating(reasoning="Test", score=score) + mock_result = Mock() + mock_result.structured_output = GoalSuccessRating(reasoning="Test", score=score) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = GoalSuccessRateEvaluator() @@ -109,10 +111,12 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, async def test_evaluate_async(mock_agent_class, evaluation_data): mock_agent = Mock() - async def mock_structured_output_async(*args, **kwargs): - return GoalSuccessRating(reasoning="All goals achieved", score=GoalSuccessScore.YES) + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = GoalSuccessRating(reasoning="All goals achieved", score=GoalSuccessScore.YES) + return mock_result - mock_agent.structured_output_async = mock_structured_output_async + mock_agent.invoke_async = mock_invoke_async mock_agent_class.return_value = mock_agent evaluator = GoalSuccessRateEvaluator() diff --git a/tests/strands_evals/evaluators/test_harmfulness_evaluator.py b/tests/strands_evals/evaluators/test_harmfulness_evaluator.py index 3ea1199..582a36f 100644 --- a/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +++ b/tests/strands_evals/evaluators/test_harmfulness_evaluator.py @@ -56,9 +56,11 @@ def test_init_with_custom_values(): @patch("strands_evals.evaluators.harmfulness_evaluator.Agent") def test_evaluate(mock_agent_class, evaluation_data): mock_agent = Mock() - mock_agent.structured_output.return_value = HarmfulnessRating( + mock_result = Mock() + mock_result.structured_output = HarmfulnessRating( reasoning="The response is safe and helpful", score=HarmfulnessScore.NOT_HARMFUL ) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = HarmfulnessEvaluator() @@ -81,7 +83,9 @@ def test_evaluate(mock_agent_class, evaluation_data): @patch("strands_evals.evaluators.harmfulness_evaluator.Agent") def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass): mock_agent = Mock() - mock_agent.structured_output.return_value = HarmfulnessRating(reasoning="Test", score=score) + mock_result = Mock() + mock_result.structured_output = HarmfulnessRating(reasoning="Test", score=score) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = HarmfulnessEvaluator() @@ -98,10 +102,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, async def test_evaluate_async(mock_agent_class, evaluation_data): mock_agent = Mock() - async def mock_structured_output_async(*args, **kwargs): - return HarmfulnessRating(reasoning="The response is safe and helpful", score=HarmfulnessScore.NOT_HARMFUL) + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = HarmfulnessRating( + reasoning="The response is safe and helpful", score=HarmfulnessScore.NOT_HARMFUL + ) + return mock_result - mock_agent.structured_output_async = mock_structured_output_async + mock_agent.invoke_async 
= mock_invoke_async mock_agent_class.return_value = mock_agent evaluator = HarmfulnessEvaluator() diff --git a/tests/strands_evals/evaluators/test_helpfulness_evaluator.py b/tests/strands_evals/evaluators/test_helpfulness_evaluator.py index e195d8a..40a8cf6 100644 --- a/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +++ b/tests/strands_evals/evaluators/test_helpfulness_evaluator.py @@ -52,9 +52,11 @@ def test_init_with_custom_values(): @patch("strands_evals.evaluators.helpfulness_evaluator.Agent") def test_evaluate(mock_agent_class, evaluation_data): mock_agent = Mock() - mock_agent.structured_output.return_value = HelpfulnessRating( + mock_result = Mock() + mock_result.structured_output = HelpfulnessRating( reasoning="The response is helpful", score=HelpfulnessScore.VERY_HELPFUL ) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = HelpfulnessEvaluator() @@ -82,7 +84,9 @@ def test_evaluate(mock_agent_class, evaluation_data): @patch("strands_evals.evaluators.helpfulness_evaluator.Agent") def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass): mock_agent = Mock() - mock_agent.structured_output.return_value = HelpfulnessRating(reasoning="Test", score=score) + mock_result = Mock() + mock_result.structured_output = HelpfulnessRating(reasoning="Test", score=score) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = HelpfulnessEvaluator() @@ -99,10 +103,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, async def test_evaluate_async(mock_agent_class, evaluation_data): mock_agent = Mock() - async def mock_structured_output_async(*args, **kwargs): - return HelpfulnessRating(reasoning="The response is helpful", score=HelpfulnessScore.VERY_HELPFUL) + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = HelpfulnessRating( + reasoning="The response is helpful", score=HelpfulnessScore.VERY_HELPFUL + ) + return mock_result - mock_agent.structured_output_async = mock_structured_output_async + mock_agent.invoke_async = mock_invoke_async mock_agent_class.return_value = mock_agent evaluator = HelpfulnessEvaluator() diff --git a/tests/strands_evals/evaluators/test_interactions_evaluator.py b/tests/strands_evals/evaluators/test_interactions_evaluator.py index 82ab323..a942ed9 100644 --- a/tests/strands_evals/evaluators/test_interactions_evaluator.py +++ b/tests/strands_evals/evaluators/test_interactions_evaluator.py @@ -10,9 +10,9 @@ def mock_agent(): """Mock Agent for testing""" agent = Mock() - agent.structured_output.return_value = EvaluationOutput( - score=0.85, test_pass=True, reason="Mock interaction evaluation" - ) + mock_result = Mock() + mock_result.structured_output = EvaluationOutput(score=0.85, test_pass=True, reason="Mock interaction evaluation") + agent.return_value = mock_result return agent @@ -21,11 +21,14 @@ def mock_async_agent(): """Mock Agent for testing with async""" agent = Mock() - # Create a mock coroutine function - async def mock_structured_output_async(*args, **kwargs): - return EvaluationOutput(score=0.85, test_pass=True, reason="Mock async interaction evaluation") + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = EvaluationOutput( + score=0.85, test_pass=True, reason="Mock async interaction evaluation" + ) + return mock_result - agent.structured_output_async = mock_structured_output_async + 
agent.invoke_async = mock_invoke_async return agent @@ -118,7 +121,7 @@ def test_interactions_evaluator_evaluate_with_full_data(mock_agent_class, evalua assert call_kwargs["callback_handler"] is None # Verify structured_output was called twice (for each interaction) - assert mock_agent.structured_output.call_count == 2 + assert mock_agent.call_count == 2 assert result[0].score == 0.85 assert result[0].test_pass is True @@ -132,13 +135,13 @@ def test_interactions_evaluator_evaluate_without_inputs(mock_agent_class, evalua result = evaluator.evaluate(evaluation_data) - # Check that structured_output was called - assert mock_agent.structured_output.call_count == 2 + # Check that agent was called + assert mock_agent.call_count == 2 assert result[0].score == 0.85 assert result[0].test_pass is True - call_args = mock_agent.structured_output.call_args - prompt = call_args[0][1] + call_args = mock_agent.call_args + prompt = call_args[0][0] assert "" not in prompt assert "" not in prompt @@ -202,12 +205,12 @@ def test_interactions_evaluator_evaluate_with_dict_rubric(mock_agent_class, mock result = evaluator.evaluate(evaluation_data) # Verify both interactions were evaluated - assert mock_agent.structured_output.call_count == 2 + assert mock_agent.call_count == 2 # Check that correct rubrics were used in prompts - call_args_list = mock_agent.structured_output.call_args_list - first_prompt = call_args_list[0][0][1] - second_prompt = call_args_list[1][0][1] + call_args_list = mock_agent.call_args_list + first_prompt = call_args_list[0][0][0] + second_prompt = call_args_list[1][0][0] assert "Evaluate planning quality and task breakdown" in first_prompt assert "Assess research thoroughness and accuracy" in second_prompt diff --git a/tests/strands_evals/evaluators/test_output_evaluator.py b/tests/strands_evals/evaluators/test_output_evaluator.py index e7a346b..079f305 100644 --- a/tests/strands_evals/evaluators/test_output_evaluator.py +++ b/tests/strands_evals/evaluators/test_output_evaluator.py @@ -10,7 +10,9 @@ def mock_agent(): """Mock Agent for testing""" agent = Mock() - agent.structured_output.return_value = EvaluationOutput(score=0.8, test_pass=True, reason="Mock evaluation result") + mock_result = Mock() + mock_result.structured_output = EvaluationOutput(score=0.8, test_pass=True, reason="Mock evaluation result") + agent.return_value = mock_result return agent @@ -20,10 +22,14 @@ def mock_async_agent(): agent = Mock() # Create a mock coroutine function - async def mock_structured_output_async(*args, **kwargs): - return EvaluationOutput(score=0.8, test_pass=True, reason="Mock async evaluation result") - - agent.structured_output_async = mock_structured_output_async + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = EvaluationOutput( + score=0.8, test_pass=True, reason="Mock async evaluation result" + ) + return mock_result + + agent.invoke_async = mock_invoke_async return agent @@ -66,11 +72,11 @@ def test_output_evaluator_evaluate_with_inputs(mock_agent_class, evaluation_data # Verify Agent was created with correct parameters mock_agent_class.assert_called_once_with(model=None, system_prompt=evaluator.system_prompt, callback_handler=None) - # Verify structured_output was called - mock_agent.structured_output.assert_called_once() - call_args = mock_agent.structured_output.call_args - assert call_args[0][0] == EvaluationOutput - prompt = call_args[0][1] + # Verify agent was called + mock_agent.assert_called_once() + call_args = 
mock_agent.call_args + prompt = call_args[0][0] + assert call_args[1]["structured_output_model"] == EvaluationOutput assert "What is 2+2?" in prompt assert "" not in prompt assert "" not in prompt @@ -91,8 +97,8 @@ def test_output_evaluator_evaluate_without_inputs(mock_agent_class, evaluation_d result = evaluator.evaluate(evaluation_data) - call_args = mock_agent.structured_output.call_args - prompt = call_args[0][1] + call_args = mock_agent.call_args + prompt = call_args[0][0] assert "" not in prompt assert "" not in prompt assert "" not in prompt @@ -117,8 +123,8 @@ def test_output_evaluator_evaluate_without_expected_output(mock_agent_class, moc evaluator.evaluate(evaluation_data) - call_args = mock_agent.structured_output.call_args - prompt = call_args[0][1] + call_args = mock_agent.call_args + prompt = call_args[0][0] assert "" not in prompt assert "result" in prompt diff --git a/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py b/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py index 870aaa2..7e2debb 100644 --- a/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +++ b/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py @@ -77,9 +77,11 @@ def test_init_with_custom_values(): @patch("strands_evals.evaluators.tool_parameter_accuracy_evaluator.Agent") def test_evaluate(mock_agent_class, evaluation_data): mock_agent = Mock() - mock_agent.structured_output.return_value = ToolParameterAccuracyRating( + mock_result = Mock() + mock_result.structured_output = ToolParameterAccuracyRating( reasoning="Parameters are faithful to context", score=ToolParameterAccuracyScore.YES ) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = ToolParameterAccuracyEvaluator() @@ -102,7 +104,9 @@ def test_evaluate(mock_agent_class, evaluation_data): @patch("strands_evals.evaluators.tool_parameter_accuracy_evaluator.Agent") def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass): mock_agent = Mock() - mock_agent.structured_output.return_value = ToolParameterAccuracyRating(reasoning="Test", score=score) + mock_result = Mock() + mock_result.structured_output = ToolParameterAccuracyRating(reasoning="Test", score=score) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = ToolParameterAccuracyEvaluator() @@ -119,12 +123,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, async def test_evaluate_async(mock_agent_class, evaluation_data): mock_agent = Mock() - async def mock_structured_output_async(*args, **kwargs): - return ToolParameterAccuracyRating( + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = ToolParameterAccuracyRating( reasoning="Parameters are faithful to context", score=ToolParameterAccuracyScore.YES ) + return mock_result - mock_agent.structured_output_async = mock_structured_output_async + mock_agent.invoke_async = mock_invoke_async mock_agent_class.return_value = mock_agent evaluator = ToolParameterAccuracyEvaluator() diff --git a/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py b/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py index 9a5243b..5955929 100644 --- a/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +++ b/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py @@ -69,9 +69,11 @@ def 
test_init_with_custom_values(): @patch("strands_evals.evaluators.tool_selection_accuracy_evaluator.Agent") def test_evaluate(mock_agent_class, evaluation_data): mock_agent = Mock() - mock_agent.structured_output.return_value = ToolSelectionRating( + mock_result = Mock() + mock_result.structured_output = ToolSelectionRating( reasoning="Tool call is appropriate", score=ToolSelectionScore.YES ) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = ToolSelectionAccuracyEvaluator() @@ -94,7 +96,9 @@ def test_evaluate(mock_agent_class, evaluation_data): @patch("strands_evals.evaluators.tool_selection_accuracy_evaluator.Agent") def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass): mock_agent = Mock() - mock_agent.structured_output.return_value = ToolSelectionRating(reasoning="Test", score=score) + mock_result = Mock() + mock_result.structured_output = ToolSelectionRating(reasoning="Test", score=score) + mock_agent.return_value = mock_result mock_agent_class.return_value = mock_agent evaluator = ToolSelectionAccuracyEvaluator() @@ -111,10 +115,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, async def test_evaluate_async(mock_agent_class, evaluation_data): mock_agent = Mock() - async def mock_structured_output_async(*args, **kwargs): - return ToolSelectionRating(reasoning="Tool call is appropriate", score=ToolSelectionScore.YES) + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = ToolSelectionRating( + reasoning="Tool call is appropriate", score=ToolSelectionScore.YES + ) + return mock_result - mock_agent.structured_output_async = mock_structured_output_async + mock_agent.invoke_async = mock_invoke_async mock_agent_class.return_value = mock_agent evaluator = ToolSelectionAccuracyEvaluator() diff --git a/tests/strands_evals/evaluators/test_trajectory_evaluator.py b/tests/strands_evals/evaluators/test_trajectory_evaluator.py index 9d59f67..7adcd05 100644 --- a/tests/strands_evals/evaluators/test_trajectory_evaluator.py +++ b/tests/strands_evals/evaluators/test_trajectory_evaluator.py @@ -10,9 +10,9 @@ def mock_agent(): """Mock Agent for testing""" agent = Mock() - agent.structured_output.return_value = EvaluationOutput( - score=0.9, test_pass=True, reason="Mock trajectory evaluation" - ) + mock_result = Mock() + mock_result.structured_output = EvaluationOutput(score=0.9, test_pass=True, reason="Mock trajectory evaluation") + agent.return_value = mock_result return agent @@ -21,11 +21,14 @@ def mock_async_agent(): """Mock Agent for testing with async""" agent = Mock() - # Create a mock coroutine function - async def mock_structured_output_async(*args, **kwargs): - return EvaluationOutput(score=0.9, test_pass=True, reason="Mock async trajectory evaluation") + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = EvaluationOutput( + score=0.9, test_pass=True, reason="Mock async trajectory evaluation" + ) + return mock_result - agent.structured_output_async = mock_structured_output_async + agent.invoke_async = mock_invoke_async return agent @@ -81,10 +84,10 @@ def test_trajectory_evaluator_evaluate_with_full_data(mock_agent_class, evaluati model=None, system_prompt=evaluator.system_prompt, tools=evaluator._tools, callback_handler=None ) - # Verify structured_output call - mock_agent.structured_output.assert_called_once() - call_args = mock_agent.structured_output.call_args - prompt = 
call_args[0][1] + # Verify agent call + mock_agent.assert_called_once() + call_args = mock_agent.call_args + prompt = call_args[0][0] assert "What's 2x2?" in prompt assert "2x2 is 4." in prompt @@ -104,8 +107,8 @@ def test_trajectory_evaluator_evaluate_without_inputs(mock_agent_class, evaluati result = evaluator.evaluate(evaluation_data) - call_args = mock_agent.structured_output.call_args - prompt = call_args[0][1] + call_args = mock_agent.call_args + prompt = call_args[0][0] assert "" not in prompt assert "2x2 is 4." in prompt assert "2x2 is 4." in prompt @@ -130,8 +133,8 @@ def test_trajectory_evaluator_evaluate_without_expected_output(mock_agent_class, evaluator.evaluate(evaluation_data) - call_args = mock_agent.structured_output.call_args - prompt = call_args[0][1] + call_args = mock_agent.call_args + prompt = call_args[0][0] assert "" not in prompt assert "result" in prompt assert "['step1', 'step2']" in prompt @@ -152,8 +155,8 @@ def test_trajectory_evaluator_evaluate_without_expected_trajectory(mock_agent_cl evaluator.evaluate(evaluation_data) - call_args = mock_agent.structured_output.call_args - prompt = call_args[0][1] + call_args = mock_agent.call_args + prompt = call_args[0][0] assert "" not in prompt assert "['step1', 'step2']" in prompt @@ -167,8 +170,8 @@ def test_trajectory_evaluator_evaluate_missing_actual_output(mock_agent_class, m evaluator.evaluate(evaluation_data) - call_args = mock_agent.structured_output.call_args - prompt = call_args[0][1] + call_args = mock_agent.call_args + prompt = call_args[0][0] assert "" not in prompt diff --git a/tests/test_integration.py b/tests/test_integration.py index 7a5186b..e3e5a16 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,5 +1,5 @@ import asyncio -from unittest.mock import AsyncMock, Mock, patch +from unittest.mock import Mock, patch import pytest @@ -52,17 +52,27 @@ def interaction_case(): @pytest.fixture def mock_agent(): + """Mock agent for evaluators using agent() call with structured_output_model""" agent = Mock() - agent.structured_output.return_value = EvaluationOutput(score=mock_score, test_pass=True, reason="LLM evaluation") + mock_result = Mock() + mock_result.structured_output = EvaluationOutput(score=mock_score, test_pass=True, reason="LLM evaluation") + agent.return_value = mock_result return agent @pytest.fixture def mock_async_agent(): + """Mock agent for async evaluators""" agent = Mock() - agent.structured_output_async = AsyncMock( - return_value=EvaluationOutput(score=mock_score, test_pass=True, reason="Async LLM evaluation") - ) + + async def mock_invoke_async(*args, **kwargs): + mock_result = Mock() + mock_result.structured_output = EvaluationOutput( + score=mock_score, test_pass=True, reason="Async LLM evaluation" + ) + return mock_result + + agent.invoke_async = mock_invoke_async return agent @@ -125,7 +135,7 @@ def simple_task(case): reports = experiment.run_evaluations(simple_task) # Verify LLM evaluator was called for each test case - assert mock_agent.structured_output.call_count == 3 + assert mock_agent.call_count == 3 assert len(reports) == 1 report = reports[0] assert len(report.scores) == 3 @@ -317,7 +327,7 @@ def task_with_interactions(case): reports = experiment.run_evaluations(task_with_interactions) # Verify the evaluator was called (once per interaction, so 2 times) - assert mock_agent.structured_output.call_count == 2 + assert mock_agent.call_count == 2 assert len(reports) == 1 report = reports[0] assert len(report.scores) == 1
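
Reviewer note (not part of the patch): every evaluator in this change moves from evaluator_agent.structured_output(Model, prompt) / structured_output_async(...) to invoking the agent directly with structured_output_model=Model (or awaiting invoke_async) and reading result.structured_output, narrowed with typing.cast; the tests correspondingly replace mock_agent.structured_output.return_value with a result mock that carries a .structured_output attribute. The sketch below is a minimal, standalone illustration of that mock wiring under those assumptions. ExampleRating and build_mock_agent are invented names for illustration only, and the only thing assumed about the Agent API is the call signature already shown in the diff.

import asyncio
from unittest.mock import Mock

from pydantic import BaseModel


class ExampleRating(BaseModel):
    """Illustrative stand-in for FaithfulnessRating, GoalSuccessRating, etc."""

    reasoning: str
    score: str


def build_mock_agent() -> Mock:
    """Build a mock agent matching the agent(prompt, structured_output_model=...) call style."""
    mock_result = Mock()
    mock_result.structured_output = ExampleRating(reasoning="Looks good", score="YES")

    mock_agent = Mock()
    # Previously the tests set mock_agent.structured_output.return_value; now calling the
    # agent itself returns a result object that carries .structured_output.
    mock_agent.return_value = mock_result

    async def mock_invoke_async(*args, **kwargs):
        # Async path mirrors: await agent.invoke_async(prompt, structured_output_model=...)
        return mock_result

    mock_agent.invoke_async = mock_invoke_async
    return mock_agent


if __name__ == "__main__":
    agent = build_mock_agent()

    # Sync pattern used by evaluate().
    result = agent("evaluate this output", structured_output_model=ExampleRating)
    print(result.structured_output.score)  # -> "YES"

    # Async pattern used by evaluate_async().
    async_result = asyncio.run(
        agent.invoke_async("evaluate this output", structured_output_model=ExampleRating)
    )
    print(async_result.structured_output.score)  # -> "YES"

One design note on the production side: typing.cast performs no runtime validation; it only narrows the static type of result.structured_output for the type checker, presumably because that attribute is typed more loosely on the agent result than the concrete rating model.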