diff --git a/src/strands_evals/evaluators/faithfulness_evaluator.py b/src/strands_evals/evaluators/faithfulness_evaluator.py
index bf79b23..8a5c9a2 100644
--- a/src/strands_evals/evaluators/faithfulness_evaluator.py
+++ b/src/strands_evals/evaluators/faithfulness_evaluator.py
@@ -1,4 +1,5 @@
from enum import Enum
+from typing import cast
from pydantic import BaseModel, Field
from strands import Agent
@@ -59,29 +60,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = evaluator_agent.structured_output(FaithfulnessRating, prompt)
+ result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating)
+ rating = cast(FaithfulnessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score,
- test_pass=normalized_score >= 0.5,
- reason=rating.reasoning,
- label=rating.score,
- )
- return [result]
+ return [
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score >= 0.5,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
+ ]
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = await evaluator_agent.structured_output_async(FaithfulnessRating, prompt)
+ result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating)
+ rating = cast(FaithfulnessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score,
- test_pass=normalized_score >= 0.5,
- reason=rating.reasoning,
- label=rating.score,
- )
- return [result]
+ return [
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score >= 0.5,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
+ ]
def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
"""Extract the most recent turn from the conversation for evaluation."""
diff --git a/src/strands_evals/evaluators/goal_success_rate_evaluator.py b/src/strands_evals/evaluators/goal_success_rate_evaluator.py
index 93a3bf5..85896bb 100644
--- a/src/strands_evals/evaluators/goal_success_rate_evaluator.py
+++ b/src/strands_evals/evaluators/goal_success_rate_evaluator.py
@@ -1,4 +1,5 @@
from enum import Enum
+from typing import cast
from pydantic import BaseModel, Field
from strands import Agent
@@ -53,29 +54,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
session_input = self._parse_trajectory(evaluation_case)
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
+ result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating)
+ rating = cast(GoalSuccessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score,
- test_pass=normalized_score >= 1.0,
- reason=rating.reasoning,
- label=rating.score,
- )
- return [result]
+ return [
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score >= 1.0,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
+ ]
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
session_input = self._parse_trajectory(evaluation_case)
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
+ result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating)
+ rating = cast(GoalSuccessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score,
- test_pass=normalized_score >= 1.0,
- reason=rating.reasoning,
- label=rating.score,
- )
- return [result]
+ return [
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score >= 1.0,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
+ ]
def _format_prompt(self, session_input: SessionLevelInput) -> str:
"""Format evaluation prompt from session-level input."""
diff --git a/src/strands_evals/evaluators/harmfulness_evaluator.py b/src/strands_evals/evaluators/harmfulness_evaluator.py
index 07c7281..3719f2a 100644
--- a/src/strands_evals/evaluators/harmfulness_evaluator.py
+++ b/src/strands_evals/evaluators/harmfulness_evaluator.py
@@ -1,4 +1,5 @@
from enum import Enum
+from typing import cast
from pydantic import BaseModel, Field
from strands import Agent
@@ -52,29 +53,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = evaluator_agent.structured_output(HarmfulnessRating, prompt)
+ result = evaluator_agent(prompt, structured_output_model=HarmfulnessRating)
+ rating = cast(HarmfulnessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score,
- test_pass=normalized_score == 1.0,
- reason=rating.reasoning,
- label=rating.score,
- )
- return [result]
+ return [
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score == 1.0,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
+ ]
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = await evaluator_agent.structured_output_async(HarmfulnessRating, prompt)
+ result = await evaluator_agent.invoke_async(prompt, structured_output_model=HarmfulnessRating)
+ rating = cast(HarmfulnessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score,
- test_pass=normalized_score == 1.0,
- reason=rating.reasoning,
- label=rating.score,
- )
- return [result]
+ return [
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score == 1.0,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
+ ]
def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
"""Extract the most recent turn from the conversation for evaluation."""
diff --git a/src/strands_evals/evaluators/helpfulness_evaluator.py b/src/strands_evals/evaluators/helpfulness_evaluator.py
index c2e893a..2fb487b 100644
--- a/src/strands_evals/evaluators/helpfulness_evaluator.py
+++ b/src/strands_evals/evaluators/helpfulness_evaluator.py
@@ -1,4 +1,5 @@
from enum import Enum
+from typing import cast
from pydantic import BaseModel, Field
from strands import Agent
@@ -65,29 +66,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = evaluator_agent.structured_output(HelpfulnessRating, prompt)
+ result = evaluator_agent(prompt, structured_output_model=HelpfulnessRating)
+ rating = cast(HelpfulnessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score,
- test_pass=normalized_score >= 0.5,
- reason=rating.reasoning,
- label=rating.score,
- )
- return [result]
+ return [
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score >= 0.5,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
+ ]
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = await evaluator_agent.structured_output_async(HelpfulnessRating, prompt)
+ result = await evaluator_agent.invoke_async(prompt, structured_output_model=HelpfulnessRating)
+ rating = cast(HelpfulnessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score,
- test_pass=normalized_score >= 0.5,
- reason=rating.reasoning,
- label=rating.score,
- )
- return [result]
+ return [
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score >= 0.5,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
+ ]
def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
"""Extract the most recent turn from the conversation for evaluation."""
diff --git a/src/strands_evals/evaluators/interactions_evaluator.py b/src/strands_evals/evaluators/interactions_evaluator.py
index ba2b59a..76f83f1 100644
--- a/src/strands_evals/evaluators/interactions_evaluator.py
+++ b/src/strands_evals/evaluators/interactions_evaluator.py
@@ -1,3 +1,5 @@
+from typing import cast
+
from strands import Agent
from strands.agent.conversation_manager import SlidingWindowConversationManager
from strands.models.model import Model
@@ -198,8 +200,8 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
for i in range(num_interactions):
is_last = i == num_interactions - 1
evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
- result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
- results.append(result)
+ result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+ results.append(cast(EvaluationOutput, result.structured_output))
return results
@@ -238,7 +240,7 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
for i in range(num_interactions):
is_last = i == num_interactions - 1
evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
- result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
- results.append(result)
+ result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+ results.append(cast(EvaluationOutput, result.structured_output))
return results
diff --git a/src/strands_evals/evaluators/output_evaluator.py b/src/strands_evals/evaluators/output_evaluator.py
index 34a60ca..1da9341 100644
--- a/src/strands_evals/evaluators/output_evaluator.py
+++ b/src/strands_evals/evaluators/output_evaluator.py
@@ -1,3 +1,5 @@
+from typing import cast
+
from strands import Agent
from strands.models.model import Model
from typing_extensions import TypeVar, Union
@@ -51,8 +53,8 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
evaluation_prompt = compose_test_prompt(
evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
)
- result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
- return [result]
+ result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+ return [cast(EvaluationOutput, result.structured_output)]
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
"""
@@ -68,5 +70,5 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
evaluation_prompt = compose_test_prompt(
evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
)
- result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
- return [result]
+ result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+ return [cast(EvaluationOutput, result.structured_output)]
diff --git a/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py b/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py
index 7a1b938..63e8ac8 100644
--- a/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py
+++ b/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py
@@ -1,4 +1,5 @@
from enum import Enum
+from typing import cast
from pydantic import BaseModel, Field
from strands import Agent
@@ -56,12 +57,17 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
for tool_input in tool_inputs:
prompt = self._format_prompt(tool_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = evaluator_agent.structured_output(ToolParameterAccuracyRating, prompt)
+ result = evaluator_agent(prompt, structured_output_model=ToolParameterAccuracyRating)
+ rating = cast(ToolParameterAccuracyRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+ results.append(
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score == 1.0,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
)
- results.append(result)
return results
@@ -72,12 +78,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
for tool_input in tool_inputs:
prompt = self._format_prompt(tool_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = await evaluator_agent.structured_output_async(ToolParameterAccuracyRating, prompt)
+ result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolParameterAccuracyRating)
+ rating = cast(ToolParameterAccuracyRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+ results.append(
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score == 1.0,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
)
- results.append(result)
return results
diff --git a/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py b/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py
index 95807a8..3904562 100644
--- a/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py
+++ b/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py
@@ -1,4 +1,5 @@
from enum import Enum
+from typing import cast
from pydantic import BaseModel, Field
from strands import Agent
@@ -56,12 +57,17 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
for tool_input in tool_inputs:
prompt = self._format_prompt(tool_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = evaluator_agent.structured_output(ToolSelectionRating, prompt)
+ result = evaluator_agent(prompt, structured_output_model=ToolSelectionRating)
+ rating = cast(ToolSelectionRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+ results.append(
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score == 1.0,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
)
- results.append(result)
return results
@@ -72,12 +78,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
for tool_input in tool_inputs:
prompt = self._format_prompt(tool_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
- rating = await evaluator_agent.structured_output_async(ToolSelectionRating, prompt)
+ result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolSelectionRating)
+ rating = cast(ToolSelectionRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
- result = EvaluationOutput(
- score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+ results.append(
+ EvaluationOutput(
+ score=normalized_score,
+ test_pass=normalized_score == 1.0,
+ reason=rating.reasoning,
+ label=rating.score,
+ )
)
- results.append(result)
return results
diff --git a/src/strands_evals/evaluators/trajectory_evaluator.py b/src/strands_evals/evaluators/trajectory_evaluator.py
index 76d47f7..049c85b 100644
--- a/src/strands_evals/evaluators/trajectory_evaluator.py
+++ b/src/strands_evals/evaluators/trajectory_evaluator.py
@@ -1,3 +1,5 @@
+from typing import cast
+
from strands import Agent
from strands.models.model import Model
from typing_extensions import Any, TypeVar, Union
@@ -74,8 +76,8 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
include_inputs=self.include_inputs,
uses_trajectory=True,
)
- result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
- return [result]
+ result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+ return [cast(EvaluationOutput, result.structured_output)]
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
"""
@@ -96,5 +98,5 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
include_inputs=self.include_inputs,
uses_trajectory=True,
)
- result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
- return [result]
+ result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+ return [cast(EvaluationOutput, result.structured_output)]
diff --git a/tests/strands_evals/evaluators/test_faithfulness_evaluator.py b/tests/strands_evals/evaluators/test_faithfulness_evaluator.py
index 85465ad..90b656a 100644
--- a/tests/strands_evals/evaluators/test_faithfulness_evaluator.py
+++ b/tests/strands_evals/evaluators/test_faithfulness_evaluator.py
@@ -50,9 +50,11 @@ def test_init_with_custom_values():
@patch("strands_evals.evaluators.faithfulness_evaluator.Agent")
def test_evaluate(mock_agent_class, evaluation_data):
mock_agent = Mock()
- mock_agent.structured_output.return_value = FaithfulnessRating(
+ mock_result = Mock()
+ mock_result.structured_output = FaithfulnessRating(
reasoning="The response is faithful", score=FaithfulnessScore.COMPLETELY_YES
)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = FaithfulnessEvaluator()
@@ -78,7 +80,9 @@ def test_evaluate(mock_agent_class, evaluation_data):
@patch("strands_evals.evaluators.faithfulness_evaluator.Agent")
def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
mock_agent = Mock()
- mock_agent.structured_output.return_value = FaithfulnessRating(reasoning="Test", score=score)
+ mock_result = Mock()
+ mock_result.structured_output = FaithfulnessRating(reasoning="Test", score=score)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = FaithfulnessEvaluator()
@@ -95,10 +99,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value,
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
- async def mock_structured_output_async(*args, **kwargs):
- return FaithfulnessRating(reasoning="The response is faithful", score=FaithfulnessScore.COMPLETELY_YES)
+ async def mock_invoke_async(*args, **kwargs):
+ mock_result = Mock()
+ mock_result.structured_output = FaithfulnessRating(
+ reasoning="The response is faithful", score=FaithfulnessScore.COMPLETELY_YES
+ )
+ return mock_result
- mock_agent.structured_output_async = mock_structured_output_async
+ mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = FaithfulnessEvaluator()
diff --git a/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py b/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py
index 53c0829..86dce93 100644
--- a/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py
+++ b/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py
@@ -67,9 +67,9 @@ def test_init_with_custom_values():
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
def test_evaluate(mock_agent_class, evaluation_data):
mock_agent = Mock()
- mock_agent.structured_output.return_value = GoalSuccessRating(
- reasoning="All goals achieved", score=GoalSuccessScore.YES
- )
+ mock_result = Mock()
+ mock_result.structured_output = GoalSuccessRating(reasoning="All goals achieved", score=GoalSuccessScore.YES)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = GoalSuccessRateEvaluator()
@@ -92,7 +92,9 @@ def test_evaluate(mock_agent_class, evaluation_data):
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
mock_agent = Mock()
- mock_agent.structured_output.return_value = GoalSuccessRating(reasoning="Test", score=score)
+ mock_result = Mock()
+ mock_result.structured_output = GoalSuccessRating(reasoning="Test", score=score)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = GoalSuccessRateEvaluator()
@@ -109,10 +111,12 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value,
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
- async def mock_structured_output_async(*args, **kwargs):
- return GoalSuccessRating(reasoning="All goals achieved", score=GoalSuccessScore.YES)
+ async def mock_invoke_async(*args, **kwargs):
+ mock_result = Mock()
+ mock_result.structured_output = GoalSuccessRating(reasoning="All goals achieved", score=GoalSuccessScore.YES)
+ return mock_result
- mock_agent.structured_output_async = mock_structured_output_async
+ mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = GoalSuccessRateEvaluator()
diff --git a/tests/strands_evals/evaluators/test_harmfulness_evaluator.py b/tests/strands_evals/evaluators/test_harmfulness_evaluator.py
index 3ea1199..582a36f 100644
--- a/tests/strands_evals/evaluators/test_harmfulness_evaluator.py
+++ b/tests/strands_evals/evaluators/test_harmfulness_evaluator.py
@@ -56,9 +56,11 @@ def test_init_with_custom_values():
@patch("strands_evals.evaluators.harmfulness_evaluator.Agent")
def test_evaluate(mock_agent_class, evaluation_data):
mock_agent = Mock()
- mock_agent.structured_output.return_value = HarmfulnessRating(
+ mock_result = Mock()
+ mock_result.structured_output = HarmfulnessRating(
reasoning="The response is safe and helpful", score=HarmfulnessScore.NOT_HARMFUL
)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = HarmfulnessEvaluator()
@@ -81,7 +83,9 @@ def test_evaluate(mock_agent_class, evaluation_data):
@patch("strands_evals.evaluators.harmfulness_evaluator.Agent")
def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
mock_agent = Mock()
- mock_agent.structured_output.return_value = HarmfulnessRating(reasoning="Test", score=score)
+ mock_result = Mock()
+ mock_result.structured_output = HarmfulnessRating(reasoning="Test", score=score)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = HarmfulnessEvaluator()
@@ -98,10 +102,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value,
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
- async def mock_structured_output_async(*args, **kwargs):
- return HarmfulnessRating(reasoning="The response is safe and helpful", score=HarmfulnessScore.NOT_HARMFUL)
+ async def mock_invoke_async(*args, **kwargs):
+ mock_result = Mock()
+ mock_result.structured_output = HarmfulnessRating(
+ reasoning="The response is safe and helpful", score=HarmfulnessScore.NOT_HARMFUL
+ )
+ return mock_result
- mock_agent.structured_output_async = mock_structured_output_async
+ mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = HarmfulnessEvaluator()
diff --git a/tests/strands_evals/evaluators/test_helpfulness_evaluator.py b/tests/strands_evals/evaluators/test_helpfulness_evaluator.py
index e195d8a..40a8cf6 100644
--- a/tests/strands_evals/evaluators/test_helpfulness_evaluator.py
+++ b/tests/strands_evals/evaluators/test_helpfulness_evaluator.py
@@ -52,9 +52,11 @@ def test_init_with_custom_values():
@patch("strands_evals.evaluators.helpfulness_evaluator.Agent")
def test_evaluate(mock_agent_class, evaluation_data):
mock_agent = Mock()
- mock_agent.structured_output.return_value = HelpfulnessRating(
+ mock_result = Mock()
+ mock_result.structured_output = HelpfulnessRating(
reasoning="The response is helpful", score=HelpfulnessScore.VERY_HELPFUL
)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = HelpfulnessEvaluator()
@@ -82,7 +84,9 @@ def test_evaluate(mock_agent_class, evaluation_data):
@patch("strands_evals.evaluators.helpfulness_evaluator.Agent")
def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
mock_agent = Mock()
- mock_agent.structured_output.return_value = HelpfulnessRating(reasoning="Test", score=score)
+ mock_result = Mock()
+ mock_result.structured_output = HelpfulnessRating(reasoning="Test", score=score)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = HelpfulnessEvaluator()
@@ -99,10 +103,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value,
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
- async def mock_structured_output_async(*args, **kwargs):
- return HelpfulnessRating(reasoning="The response is helpful", score=HelpfulnessScore.VERY_HELPFUL)
+ async def mock_invoke_async(*args, **kwargs):
+ mock_result = Mock()
+ mock_result.structured_output = HelpfulnessRating(
+ reasoning="The response is helpful", score=HelpfulnessScore.VERY_HELPFUL
+ )
+ return mock_result
- mock_agent.structured_output_async = mock_structured_output_async
+ mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = HelpfulnessEvaluator()
diff --git a/tests/strands_evals/evaluators/test_interactions_evaluator.py b/tests/strands_evals/evaluators/test_interactions_evaluator.py
index 82ab323..a942ed9 100644
--- a/tests/strands_evals/evaluators/test_interactions_evaluator.py
+++ b/tests/strands_evals/evaluators/test_interactions_evaluator.py
@@ -10,9 +10,9 @@
def mock_agent():
"""Mock Agent for testing"""
agent = Mock()
- agent.structured_output.return_value = EvaluationOutput(
- score=0.85, test_pass=True, reason="Mock interaction evaluation"
- )
+ mock_result = Mock()
+ mock_result.structured_output = EvaluationOutput(score=0.85, test_pass=True, reason="Mock interaction evaluation")
+ agent.return_value = mock_result
return agent
@@ -21,11 +21,14 @@ def mock_async_agent():
"""Mock Agent for testing with async"""
agent = Mock()
- # Create a mock coroutine function
- async def mock_structured_output_async(*args, **kwargs):
- return EvaluationOutput(score=0.85, test_pass=True, reason="Mock async interaction evaluation")
+ async def mock_invoke_async(*args, **kwargs):
+ mock_result = Mock()
+ mock_result.structured_output = EvaluationOutput(
+ score=0.85, test_pass=True, reason="Mock async interaction evaluation"
+ )
+ return mock_result
- agent.structured_output_async = mock_structured_output_async
+ agent.invoke_async = mock_invoke_async
return agent
@@ -118,7 +121,7 @@ def test_interactions_evaluator_evaluate_with_full_data(mock_agent_class, evalua
assert call_kwargs["callback_handler"] is None
- # Verify structured_output was called twice (for each interaction)
- assert mock_agent.structured_output.call_count == 2
+ # Verify agent was called twice (for each interaction)
+ assert mock_agent.call_count == 2
assert result[0].score == 0.85
assert result[0].test_pass is True
@@ -132,13 +135,13 @@ def test_interactions_evaluator_evaluate_without_inputs(mock_agent_class, evalua
result = evaluator.evaluate(evaluation_data)
- # Check that structured_output was called
- assert mock_agent.structured_output.call_count == 2
+ # Check that agent was called
+ assert mock_agent.call_count == 2
assert result[0].score == 0.85
assert result[0].test_pass is True
- call_args = mock_agent.structured_output.call_args
- prompt = call_args[0][1]
+ call_args = mock_agent.call_args
+ prompt = call_args[0][0]
assert "" not in prompt
assert "" not in prompt
@@ -202,12 +205,12 @@ def test_interactions_evaluator_evaluate_with_dict_rubric(mock_agent_class, mock
result = evaluator.evaluate(evaluation_data)
# Verify both interactions were evaluated
- assert mock_agent.structured_output.call_count == 2
+ assert mock_agent.call_count == 2
# Check that correct rubrics were used in prompts
- call_args_list = mock_agent.structured_output.call_args_list
- first_prompt = call_args_list[0][0][1]
- second_prompt = call_args_list[1][0][1]
+ call_args_list = mock_agent.call_args_list
+ first_prompt = call_args_list[0][0][0]
+ second_prompt = call_args_list[1][0][0]
assert "Evaluate planning quality and task breakdown" in first_prompt
assert "Assess research thoroughness and accuracy" in second_prompt
diff --git a/tests/strands_evals/evaluators/test_output_evaluator.py b/tests/strands_evals/evaluators/test_output_evaluator.py
index e7a346b..079f305 100644
--- a/tests/strands_evals/evaluators/test_output_evaluator.py
+++ b/tests/strands_evals/evaluators/test_output_evaluator.py
@@ -10,7 +10,9 @@
def mock_agent():
"""Mock Agent for testing"""
agent = Mock()
- agent.structured_output.return_value = EvaluationOutput(score=0.8, test_pass=True, reason="Mock evaluation result")
+ mock_result = Mock()
+ mock_result.structured_output = EvaluationOutput(score=0.8, test_pass=True, reason="Mock evaluation result")
+ agent.return_value = mock_result
return agent
@@ -20,10 +22,14 @@ def mock_async_agent():
agent = Mock()
# Create a mock coroutine function
- async def mock_structured_output_async(*args, **kwargs):
- return EvaluationOutput(score=0.8, test_pass=True, reason="Mock async evaluation result")
-
- agent.structured_output_async = mock_structured_output_async
+ async def mock_invoke_async(*args, **kwargs):
+ mock_result = Mock()
+ mock_result.structured_output = EvaluationOutput(
+ score=0.8, test_pass=True, reason="Mock async evaluation result"
+ )
+ return mock_result
+
+ agent.invoke_async = mock_invoke_async
return agent
@@ -66,11 +72,11 @@ def test_output_evaluator_evaluate_with_inputs(mock_agent_class, evaluation_data
# Verify Agent was created with correct parameters
mock_agent_class.assert_called_once_with(model=None, system_prompt=evaluator.system_prompt, callback_handler=None)
- # Verify structured_output was called
- mock_agent.structured_output.assert_called_once()
- call_args = mock_agent.structured_output.call_args
- assert call_args[0][0] == EvaluationOutput
- prompt = call_args[0][1]
+ # Verify agent was called
+ mock_agent.assert_called_once()
+ call_args = mock_agent.call_args
+ prompt = call_args[0][0]
+ assert call_args[1]["structured_output_model"] == EvaluationOutput
assert "What is 2+2?" in prompt
assert "" not in prompt
assert "" not in prompt
@@ -91,8 +97,8 @@ def test_output_evaluator_evaluate_without_inputs(mock_agent_class, evaluation_d
result = evaluator.evaluate(evaluation_data)
- call_args = mock_agent.structured_output.call_args
- prompt = call_args[0][1]
+ call_args = mock_agent.call_args
+ prompt = call_args[0][0]
assert "" not in prompt
assert "" not in prompt
assert "" not in prompt
@@ -117,8 +123,8 @@ def test_output_evaluator_evaluate_without_expected_output(mock_agent_class, moc
evaluator.evaluate(evaluation_data)
- call_args = mock_agent.structured_output.call_args
- prompt = call_args[0][1]
+ call_args = mock_agent.call_args
+ prompt = call_args[0][0]
assert "" not in prompt
assert "" in prompt
diff --git a/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py b/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py
index 870aaa2..7e2debb 100644
--- a/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py
+++ b/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py
@@ -77,9 +77,11 @@ def test_init_with_custom_values():
@patch("strands_evals.evaluators.tool_parameter_accuracy_evaluator.Agent")
def test_evaluate(mock_agent_class, evaluation_data):
mock_agent = Mock()
- mock_agent.structured_output.return_value = ToolParameterAccuracyRating(
+ mock_result = Mock()
+ mock_result.structured_output = ToolParameterAccuracyRating(
reasoning="Parameters are faithful to context", score=ToolParameterAccuracyScore.YES
)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ToolParameterAccuracyEvaluator()
@@ -102,7 +104,9 @@ def test_evaluate(mock_agent_class, evaluation_data):
@patch("strands_evals.evaluators.tool_parameter_accuracy_evaluator.Agent")
def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
mock_agent = Mock()
- mock_agent.structured_output.return_value = ToolParameterAccuracyRating(reasoning="Test", score=score)
+ mock_result = Mock()
+ mock_result.structured_output = ToolParameterAccuracyRating(reasoning="Test", score=score)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ToolParameterAccuracyEvaluator()
@@ -119,12 +123,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value,
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
- async def mock_structured_output_async(*args, **kwargs):
- return ToolParameterAccuracyRating(
+ async def mock_invoke_async(*args, **kwargs):
+ mock_result = Mock()
+ mock_result.structured_output = ToolParameterAccuracyRating(
reasoning="Parameters are faithful to context", score=ToolParameterAccuracyScore.YES
)
+ return mock_result
- mock_agent.structured_output_async = mock_structured_output_async
+ mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = ToolParameterAccuracyEvaluator()
diff --git a/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py b/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py
index 9a5243b..5955929 100644
--- a/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py
+++ b/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py
@@ -69,9 +69,11 @@ def test_init_with_custom_values():
@patch("strands_evals.evaluators.tool_selection_accuracy_evaluator.Agent")
def test_evaluate(mock_agent_class, evaluation_data):
mock_agent = Mock()
- mock_agent.structured_output.return_value = ToolSelectionRating(
+ mock_result = Mock()
+ mock_result.structured_output = ToolSelectionRating(
reasoning="Tool call is appropriate", score=ToolSelectionScore.YES
)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ToolSelectionAccuracyEvaluator()
@@ -94,7 +96,9 @@ def test_evaluate(mock_agent_class, evaluation_data):
@patch("strands_evals.evaluators.tool_selection_accuracy_evaluator.Agent")
def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
mock_agent = Mock()
- mock_agent.structured_output.return_value = ToolSelectionRating(reasoning="Test", score=score)
+ mock_result = Mock()
+ mock_result.structured_output = ToolSelectionRating(reasoning="Test", score=score)
+ mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ToolSelectionAccuracyEvaluator()
@@ -111,10 +115,14 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value,
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
- async def mock_structured_output_async(*args, **kwargs):
- return ToolSelectionRating(reasoning="Tool call is appropriate", score=ToolSelectionScore.YES)
+ async def mock_invoke_async(*args, **kwargs):
+ mock_result = Mock()
+ mock_result.structured_output = ToolSelectionRating(
+ reasoning="Tool call is appropriate", score=ToolSelectionScore.YES
+ )
+ return mock_result
- mock_agent.structured_output_async = mock_structured_output_async
+ mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = ToolSelectionAccuracyEvaluator()
diff --git a/tests/strands_evals/evaluators/test_trajectory_evaluator.py b/tests/strands_evals/evaluators/test_trajectory_evaluator.py
index 9d59f67..7adcd05 100644
--- a/tests/strands_evals/evaluators/test_trajectory_evaluator.py
+++ b/tests/strands_evals/evaluators/test_trajectory_evaluator.py
@@ -10,9 +10,9 @@
def mock_agent():
"""Mock Agent for testing"""
agent = Mock()
- agent.structured_output.return_value = EvaluationOutput(
- score=0.9, test_pass=True, reason="Mock trajectory evaluation"
- )
+ mock_result = Mock()
+ mock_result.structured_output = EvaluationOutput(score=0.9, test_pass=True, reason="Mock trajectory evaluation")
+ agent.return_value = mock_result
return agent
@@ -21,11 +21,14 @@ def mock_async_agent():
"""Mock Agent for testing with async"""
agent = Mock()
- # Create a mock coroutine function
- async def mock_structured_output_async(*args, **kwargs):
- return EvaluationOutput(score=0.9, test_pass=True, reason="Mock async trajectory evaluation")
+ async def mock_invoke_async(*args, **kwargs):
+ mock_result = Mock()
+ mock_result.structured_output = EvaluationOutput(
+ score=0.9, test_pass=True, reason="Mock async trajectory evaluation"
+ )
+ return mock_result
- agent.structured_output_async = mock_structured_output_async
+ agent.invoke_async = mock_invoke_async
return agent
@@ -81,10 +84,10 @@ def test_trajectory_evaluator_evaluate_with_full_data(mock_agent_class, evaluati
model=None, system_prompt=evaluator.system_prompt, tools=evaluator._tools, callback_handler=None
)
- # Verify structured_output call
- mock_agent.structured_output.assert_called_once()
- call_args = mock_agent.structured_output.call_args
- prompt = call_args[0][1]
+ # Verify agent call
+ mock_agent.assert_called_once()
+ call_args = mock_agent.call_args
+ prompt = call_args[0][0]
assert "What's 2x2?" in prompt
assert "