37 changes: 21 additions & 16 deletions src/strands_evals/evaluators/faithfulness_evaluator.py
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -59,29 +60,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = evaluator_agent.structured_output(FaithfulnessRating, prompt)
+        result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating)
+        rating = cast(FaithfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 0.5,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = await evaluator_agent.structured_output_async(FaithfulnessRating, prompt)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating)
+        rating = cast(FaithfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 0.5,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
         """Extract the most recent turn from the conversation for evaluation."""
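Reviewer note, for orientation: the same mechanical change repeats in every evaluator file below. The removed `Agent.structured_output(Model, prompt)` call becomes a direct agent invocation with `structured_output_model=Model`, and the parsed model is read from `result.structured_output` behind a `cast`; the async variant swaps `structured_output_async` for `invoke_async`. A minimal sketch of the before/after shape, using only the call signatures visible in this diff (`ExampleRating`, `rate`, and `rate_async` are illustrative stand-ins, not names from strands_evals):

```python
from typing import cast

from pydantic import BaseModel
from strands import Agent


class ExampleRating(BaseModel):
    # Illustrative stand-in for FaithfulnessRating / GoalSuccessRating / etc.
    score: str
    reasoning: str


def rate(evaluator_agent: Agent, prompt: str) -> ExampleRating:
    # Old API (removed in this PR):
    #   rating = evaluator_agent.structured_output(ExampleRating, prompt)
    # New API: invoke the agent directly and read the parsed model off the result.
    result = evaluator_agent(prompt, structured_output_model=ExampleRating)
    return cast(ExampleRating, result.structured_output)


async def rate_async(evaluator_agent: Agent, prompt: str) -> ExampleRating:
    # Async counterpart: structured_output_async(...) becomes invoke_async(...).
    result = await evaluator_agent.invoke_async(prompt, structured_output_model=ExampleRating)
    return cast(ExampleRating, result.structured_output)
```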
37 changes: 21 additions & 16 deletions src/strands_evals/evaluators/goal_success_rate_evaluator.py
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -53,29 +54,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         session_input = self._parse_trajectory(evaluation_case)
         prompt = self._format_prompt(session_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
+        result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating)
+        rating = cast(GoalSuccessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 1.0,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         session_input = self._parse_trajectory(evaluation_case)
         prompt = self._format_prompt(session_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating)
+        rating = cast(GoalSuccessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 1.0,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     def _format_prompt(self, session_input: SessionLevelInput) -> str:
         """Format evaluation prompt from session-level input."""
37 changes: 21 additions & 16 deletions src/strands_evals/evaluators/harmfulness_evaluator.py
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -52,29 +53,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = evaluator_agent.structured_output(HarmfulnessRating, prompt)
+        result = evaluator_agent(prompt, structured_output_model=HarmfulnessRating)
+        rating = cast(HarmfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score == 1.0,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score == 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = await evaluator_agent.structured_output_async(HarmfulnessRating, prompt)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=HarmfulnessRating)
+        rating = cast(HarmfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score == 1.0,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score == 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
         """Extract the most recent turn from the conversation for evaluation."""
37 changes: 21 additions & 16 deletions src/strands_evals/evaluators/helpfulness_evaluator.py
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -65,29 +66,33 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = evaluator_agent.structured_output(HelpfulnessRating, prompt)
+        result = evaluator_agent(prompt, structured_output_model=HelpfulnessRating)
+        rating = cast(HelpfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 0.5,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = await evaluator_agent.structured_output_async(HelpfulnessRating, prompt)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=HelpfulnessRating)
+        rating = cast(HelpfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 0.5,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
         """Extract the most recent turn from the conversation for evaluation."""
10 changes: 6 additions & 4 deletions src/strands_evals/evaluators/interactions_evaluator.py
@@ -1,3 +1,5 @@
+from typing import cast
+
 from strands import Agent
 from strands.agent.conversation_manager import SlidingWindowConversationManager
 from strands.models.model import Model
@@ -198,8 +200,8 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         for i in range(num_interactions):
             is_last = i == num_interactions - 1
             evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
-            result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
-            results.append(result)
+            result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+            results.append(cast(EvaluationOutput, result.structured_output))
 
         return results
 
@@ -238,7 +240,7 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
         for i in range(num_interactions):
             is_last = i == num_interactions - 1
             evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
-            result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
-            results.append(result)
+            result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+            results.append(cast(EvaluationOutput, result.structured_output))
 
         return results
10 changes: 6 additions & 4 deletions src/strands_evals/evaluators/output_evaluator.py
@@ -1,3 +1,5 @@
+from typing import cast
+
 from strands import Agent
 from strands.models.model import Model
 from typing_extensions import TypeVar, Union
@@ -51,8 +53,8 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         evaluation_prompt = compose_test_prompt(
             evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
         )
-        result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
-        return [result]
+        result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+        return [cast(EvaluationOutput, result.structured_output)]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         """
@@ -68,5 +70,5 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
         evaluation_prompt = compose_test_prompt(
             evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
         )
-        result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
-        return [result]
+        result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+        return [cast(EvaluationOutput, result.structured_output)]
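Reviewer note on the repeated `cast(...)` calls: `typing.cast` only narrows the type for the checker and performs no runtime validation, so a missing or malformed structured output would surface later (for example as an `AttributeError` on `rating.score`) rather than at the call site. If a runtime guarantee is preferred, a small helper along these lines could replace the casts; this is a sketch using only the standard library and pydantic, and `expect_structured` is a hypothetical name, not an existing strands_evals utility:

```python
from typing import TypeVar

from pydantic import BaseModel

T = TypeVar("T", bound=BaseModel)


def expect_structured(value: object, model_cls: type[T]) -> T:
    # Unlike typing.cast, this raises immediately if the agent did not return
    # an instance of the expected structured-output model.
    if not isinstance(value, model_cls):
        raise TypeError(f"expected {model_cls.__name__}, got {type(value).__name__}")
    return value


# Usage (same call shape as in this diff):
#   rating = expect_structured(result.structured_output, EvaluationOutput)
```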
27 changes: 19 additions & 8 deletions src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -56,12 +57,17 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         for tool_input in tool_inputs:
             prompt = self._format_prompt(tool_input)
             evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-            rating = evaluator_agent.structured_output(ToolParameterAccuracyRating, prompt)
+            result = evaluator_agent(prompt, structured_output_model=ToolParameterAccuracyRating)
+            rating = cast(ToolParameterAccuracyRating, result.structured_output)
             normalized_score = self._score_mapping[rating.score]
-            result = EvaluationOutput(
-                score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+            results.append(
+                EvaluationOutput(
+                    score=normalized_score,
+                    test_pass=normalized_score == 1.0,
+                    reason=rating.reasoning,
+                    label=rating.score,
+                )
             )
-            results.append(result)
 
         return results
 
@@ -72,12 +78,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
         for tool_input in tool_inputs:
             prompt = self._format_prompt(tool_input)
             evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-            rating = await evaluator_agent.structured_output_async(ToolParameterAccuracyRating, prompt)
+            result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolParameterAccuracyRating)
+            rating = cast(ToolParameterAccuracyRating, result.structured_output)
             normalized_score = self._score_mapping[rating.score]
-            result = EvaluationOutput(
-                score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+            results.append(
+                EvaluationOutput(
+                    score=normalized_score,
+                    test_pass=normalized_score == 1.0,
+                    reason=rating.reasoning,
+                    label=rating.score,
+                )
             )
-            results.append(result)
 
         return results
 
27 changes: 19 additions & 8 deletions src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -56,12 +57,17 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         for tool_input in tool_inputs:
             prompt = self._format_prompt(tool_input)
             evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-            rating = evaluator_agent.structured_output(ToolSelectionRating, prompt)
+            result = evaluator_agent(prompt, structured_output_model=ToolSelectionRating)
+            rating = cast(ToolSelectionRating, result.structured_output)
             normalized_score = self._score_mapping[rating.score]
-            result = EvaluationOutput(
-                score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+            results.append(
+                EvaluationOutput(
+                    score=normalized_score,
+                    test_pass=normalized_score == 1.0,
+                    reason=rating.reasoning,
+                    label=rating.score,
+                )
             )
-            results.append(result)
 
         return results
 
@@ -72,12 +78,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
         for tool_input in tool_inputs:
             prompt = self._format_prompt(tool_input)
             evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-            rating = await evaluator_agent.structured_output_async(ToolSelectionRating, prompt)
+            result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolSelectionRating)
+            rating = cast(ToolSelectionRating, result.structured_output)
             normalized_score = self._score_mapping[rating.score]
-            result = EvaluationOutput(
-                score=normalized_score, test_pass=normalized_score == 1.0, reason=rating.reasoning, label=rating.score
+            results.append(
+                EvaluationOutput(
+                    score=normalized_score,
+                    test_pass=normalized_score == 1.0,
+                    reason=rating.reasoning,
+                    label=rating.score,
+                )
             )
-            results.append(result)
 
         return results
 