From 7b9c4bb05a685706a4a3463cfef8448879820545 Mon Sep 17 00:00:00 2001 From: Deepak Dalakoti Date: Tue, 25 Nov 2025 09:29:22 +1100 Subject: [PATCH 1/5] add langfuse trace mapper --- pyproject.toml | 1 + .../goal_success_rate_evaluator_langfuse.py | 40 + .../helpfulness_evaluator_langfuse.py | 42 + src/examples/langfuse_utils/trace.json | 915 ++++++++++++++++++ src/examples/langfuse_utils/utils.py | 62 ++ src/strands_evals/mappers/langfuse_mapper.py | 328 +++++++ .../mappers/test_langfuse_mapper.py | 179 ++++ 7 files changed, 1567 insertions(+) create mode 100644 src/examples/goal_success_rate_evaluator_langfuse.py create mode 100644 src/examples/helpfulness_evaluator_langfuse.py create mode 100644 src/examples/langfuse_utils/trace.json create mode 100644 src/examples/langfuse_utils/utils.py create mode 100644 src/strands_evals/mappers/langfuse_mapper.py create mode 100644 tests/strands_evals/mappers/test_langfuse_mapper.py diff --git a/pyproject.toml b/pyproject.toml index 2935d90..26fde8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ authors = [ ] dependencies = [ + "langfuse>=3.9.0,<4.0.0", "pydantic>=2.0.0,<3.0.0", "rich>=14.0.0,<15.0.0", "strands-agents>=1.0.0", diff --git a/src/examples/goal_success_rate_evaluator_langfuse.py b/src/examples/goal_success_rate_evaluator_langfuse.py new file mode 100644 index 0000000..2c2aca5 --- /dev/null +++ b/src/examples/goal_success_rate_evaluator_langfuse.py @@ -0,0 +1,40 @@ +from langfuse_utils.utils import get_langfuse_trace + +from strands_evals import Case, Dataset +from strands_evals.evaluators import GoalSuccessRateEvaluator +from strands_evals.mappers.langfuse_mapper import LangfuseSessionMapper + +# ====================================== +# SETUP AND RUN STRANDS EVAL +# ====================================== + + +# 1. 
Define a task function +def user_task_function(case: Case) -> dict: + mapper = LangfuseSessionMapper() + trace_id = case.metadata.get("trace_id", "") if case.metadata else "" + langfuse_traces = get_langfuse_trace(trace_id=trace_id) + session = mapper.map_to_session(langfuse_traces, trace_id) + agent_response = mapper.get_agent_final_response(session) + + return {"output": str(agent_response), "trajectory": session} + + +test_cases = [ + Case[str, str]( + name="knowledge-1", + input="What is the status of my package", + metadata={"category": "knowledge", "trace_id": "13cc6db7f2e290b54bab3061a6fc5db0"}, + ), +] + + +# 3. Create an evaluator +evaluator = GoalSuccessRateEvaluator() + +# 4. Create a dataset +dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator) + +# 5. Run evaluations +report = dataset.run_evaluations(user_task_function) +report.run_display() diff --git a/src/examples/helpfulness_evaluator_langfuse.py b/src/examples/helpfulness_evaluator_langfuse.py new file mode 100644 index 0000000..5050d43 --- /dev/null +++ b/src/examples/helpfulness_evaluator_langfuse.py @@ -0,0 +1,42 @@ +from langfuse_utils.utils import get_langfuse_trace + +from strands_evals import Case, Dataset +from strands_evals.evaluators import HelpfulnessEvaluator +from strands_evals.mappers.langfuse_mapper import ( + LangfuseSessionMapper, +) + +# ====================================== +# SETUP AND RUN STRANDS EVAL +# ====================================== + + +# 1. 
Define a task function +def user_task_function(case: Case) -> dict: + mapper = LangfuseSessionMapper() + trace_id = case.metadata.get("trace_id", "") if case.metadata else "" + langfuse_traces = get_langfuse_trace(trace_id=trace_id) + session = mapper.map_to_session(langfuse_traces, trace_id) + agent_response = mapper.get_agent_final_response(session) + + return {"output": str(agent_response), "trajectory": session} + + +test_cases = [ + Case[str, str]( + name="knowledge-1", + input="What is the status of my package", + metadata={"category": "knowledge", "trace_id": "13cc6db7f2e290b54bab3061a6fc5db0"}, + ), +] + + +# 3. Create an evaluator +evaluator = HelpfulnessEvaluator() + +# 4. Create a dataset +dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator) + +# 5. Run evaluations +report = dataset.run_evaluations(user_task_function) +report.run_display() diff --git a/src/examples/langfuse_utils/trace.json b/src/examples/langfuse_utils/trace.json new file mode 100644 index 0000000..1b9fa68 --- /dev/null +++ b/src/examples/langfuse_utils/trace.json @@ -0,0 +1,915 @@ +{ + "id": "13cc6db7f2e290b54bab3061a6fc5db0", + "timestamp": "2025-11-14T01:00:01.245000Z", + "name": "run_agent", + "input": [ + { + "role": "user", + "content": "[{\"text\": \"What is the status of my package\"}]" + }, + { + "role": "assistant", + "content": "[{\"text\": \"I'll check the status of your package for you, Bob.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"name\": \"check_package_status\", \"input\": {\"user_name\": \"Bob\"}}}]" + }, + { + "role": "tool", + "content": "[{\"toolResult\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"status\": \"success\", \"content\": [{\"text\": \"Package is ready for pickup\"}]}}]" + }, + { + "role": "assistant", + "content": "[{\"text\": \"Great news! Your package is ready for pickup. 
Since it's ready, let me also check the total cost for you.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_eyRoAW-ARTq9inBduvuTtg\", \"name\": \"calculate_price\", \"input\": {}}}]" + }, + { + "role": "tool", + "content": "[{\"toolResult\": {\"toolUseId\": \"tooluse_eyRoAW-ARTq9inBduvuTtg\", \"status\": \"success\", \"content\": [{\"text\": \"Total cost is $100\"}]}}]" + } + ], + "output": { + "message": "[{\"text\": \"Your package is ready for pickup and the total cost is $100. You can come collect it whenever convenient for you!\"}]" + }, + "sessionId": "test_session", + "release": null, + "version": "1.10.0", + "userId": null, + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:04.562535+00:00", + "gen_ai.system": "strands-agents", + "gen_ai.agent.name": "Strands Agents", + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "us.anthropic.claude-sonnet-4-20250514-v1:0", + "gen_ai.agent.tools": "[\"check_package_status\", \"calculate_price\"]", + "system_prompt": "You are a helpful assistant, user will ask you about their package. You should first check it's status. If status is ready for pickup you should check cost. 
The name of the user is Bob", + "gen_ai.event.end_time": "2025-11-14T01:00:06.456260+00:00", + "gen_ai.usage.prompt_tokens": "653", + "gen_ai.usage.completion_tokens": "27", + "gen_ai.usage.input_tokens": "653", + "gen_ai.usage.output_tokens": "27", + "gen_ai.usage.total_tokens": "680", + "gen_ai.usage.cache_read_input_tokens": "{\"intValue\":0}", + "gen_ai.usage.cache_write_input_tokens": "{\"intValue\":0}", + "event_loop.cycle_id": "48b3f3e3-8549-4126-909d-b2d9a15b938e", + "gen_ai.tool.name": "calculate_price", + "gen_ai.tool.call.id": "tooluse_eyRoAW-ARTq9inBduvuTtg", + "tool.status": "success", + "event_loop.parent_cycle_id": "e46a39a5-f765-4628-9618-acf741d74b77" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "version": "3.9.3", + "attributes": { + "public_key": "pk-lf-5f6607e2-6846-4ce2-97d2-b01375ddd4cb" + } + } + }, + "tags": [], + "public": false, + "environment": "default", + "htmlPath": "/project/cmhenx6m80006z4075iumt8px/traces/13cc6db7f2e290b54bab3061a6fc5db0", + "latency": 5.282, + "totalCost": 0.0, + "observations": [ + { + "id": "ca8183085bc4c831", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "SPAN", + "name": "run_agent", + "startTime": "2025-11-14T01:00:01.241000Z", + "endTime": "2025-11-14T01:00:06.523000Z", + "completionStartTime": null, + "model": null, + "modelParameters": null, + "input": { + "args": [ + "What is the status of my package" + ], + "kwargs": {} + }, + "version": "1.10.0", + "metadata": { + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "langfuse-sdk", + "version": "3.9.3", + "attributes": { + "public_key": 
"pk-lf-5f6607e2-6846-4ce2-97d2-b01375ddd4cb" + } + } + }, + "output": null, + "usage": { + "input": 0, + "output": 0, + "total": 0, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": null, + "promptId": null, + "usageDetails": {}, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 5282.0, + "timeToFirstToken": null, + "promptTokens": 0, + "completionTokens": 0, + "updatedAt": "2025-11-14T01:00:11.193Z", + "totalTokens": 0, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:11.179Z" + }, + { + "id": "28b7038083b46985", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "GENERATION", + "name": "invoke_agent Strands Agents", + "startTime": "2025-11-14T01:00:01.242000Z", + "endTime": "2025-11-14T01:00:06.495000Z", + "completionStartTime": null, + "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", + "modelParameters": {}, + "input": [ + { + "role": "user", + "content": "[{\"text\": \"What is the status of my package\"}]" + } + ], + "version": "1.10.0", + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:01.242853+00:00", + "gen_ai.system": "strands-agents", + "gen_ai.agent.name": "Strands Agents", + "gen_ai.operation.name": "invoke_agent", + "gen_ai.request.model": "us.anthropic.claude-sonnet-4-20250514-v1:0", + "gen_ai.agent.tools": "[\"check_package_status\", \"calculate_price\"]", + "system_prompt": "You are a helpful assistant, user will ask you about their package. You should first check it's status. If status is ready for pickup you should check cost. 
The name of the user is Bob", + "gen_ai.event.end_time": "2025-11-14T01:00:06.495171+00:00", + "gen_ai.usage.prompt_tokens": "1709", + "gen_ai.usage.completion_tokens": "164", + "gen_ai.usage.input_tokens": "1709", + "gen_ai.usage.output_tokens": "164", + "gen_ai.usage.total_tokens": "1873", + "gen_ai.usage.cache_read_input_tokens": "{\"intValue\":0}", + "gen_ai.usage.cache_write_input_tokens": "{\"intValue\":0}" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "attributes": {} + } + }, + "output": { + "message": "Your package is ready for pickup and the total cost is $100. You can come collect it whenever convenient for you!\n", + "finish_reason": "end_turn" + }, + "usage": { + "input": 1709, + "output": 164, + "total": 1873, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": "ca8183085bc4c831", + "promptId": null, + "usageDetails": { + "input": 1709, + "output": 164, + "total": 1873 + }, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 5253.0, + "timeToFirstToken": null, + "promptTokens": 1709, + "completionTokens": 164, + "updatedAt": "2025-11-14T01:00:06.626Z", + "totalTokens": 1873, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:06.601Z" + }, + { + "id": "09a3d2c149be6213", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "SPAN", + "name": "execute_event_loop_cycle", + "startTime": "2025-11-14T01:00:04.562000Z", + "endTime": "2025-11-14T01:00:06.456000Z", + "completionStartTime": null, + "model": null, + 
"modelParameters": null, + "input": [ + { + "role": "user", + "content": "[{\"text\": \"What is the status of my package\"}]" + }, + { + "role": "assistant", + "content": "[{\"text\": \"I'll check the status of your package for you, Bob.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"name\": \"check_package_status\", \"input\": {\"user_name\": \"Bob\"}}}]" + }, + { + "role": "tool", + "content": "[{\"toolResult\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"status\": \"success\", \"content\": [{\"text\": \"Package is ready for pickup\"}]}}]" + }, + { + "role": "assistant", + "content": "[{\"text\": \"Great news! Your package is ready for pickup. Since it's ready, let me also check the total cost for you.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_eyRoAW-ARTq9inBduvuTtg\", \"name\": \"calculate_price\", \"input\": {}}}]" + }, + { + "role": "tool", + "content": "[{\"toolResult\": {\"toolUseId\": \"tooluse_eyRoAW-ARTq9inBduvuTtg\", \"status\": \"success\", \"content\": [{\"text\": \"Total cost is $100\"}]}}]" + } + ], + "version": "1.10.0", + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:04.562535+00:00", + "event_loop.cycle_id": "48b3f3e3-8549-4126-909d-b2d9a15b938e", + "event_loop.parent_cycle_id": "e46a39a5-f765-4628-9618-acf741d74b77", + "gen_ai.event.end_time": "2025-11-14T01:00:06.456260+00:00" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "attributes": {} + } + }, + "output": { + "message": "[{\"text\": \"Your package is ready for pickup and the total cost is $100. 
You can come collect it whenever convenient for you!\"}]" + }, + "usage": { + "input": 0, + "output": 0, + "total": 0, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": "28b7038083b46985", + "promptId": null, + "usageDetails": {}, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 1894.0, + "timeToFirstToken": null, + "promptTokens": 0, + "completionTokens": 0, + "updatedAt": "2025-11-14T01:00:06.597Z", + "totalTokens": 0, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:06.569Z" + }, + { + "id": "3f1a15bd9e8cd207", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "GENERATION", + "name": "chat", + "startTime": "2025-11-14T01:00:04.562000Z", + "endTime": "2025-11-14T01:00:06.384000Z", + "completionStartTime": null, + "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", + "modelParameters": {}, + "input": [ + { + "role": "user", + "content": "[{\"text\": \"What is the status of my package\"}]" + }, + { + "role": "assistant", + "content": "[{\"text\": \"I'll check the status of your package for you, Bob.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"name\": \"check_package_status\", \"input\": {\"user_name\": \"Bob\"}}}]" + }, + { + "role": "tool", + "content": "[{\"toolResult\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"status\": \"success\", \"content\": [{\"text\": \"Package is ready for pickup\"}]}}]" + }, + { + "role": "assistant", + "content": "[{\"text\": \"Great news! Your package is ready for pickup. 
Since it's ready, let me also check the total cost for you.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_eyRoAW-ARTq9inBduvuTtg\", \"name\": \"calculate_price\", \"input\": {}}}]" + }, + { + "role": "tool", + "content": "[{\"toolResult\": {\"toolUseId\": \"tooluse_eyRoAW-ARTq9inBduvuTtg\", \"status\": \"success\", \"content\": [{\"text\": \"Total cost is $100\"}]}}]" + } + ], + "version": "1.10.0", + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:04.562625+00:00", + "gen_ai.system": "strands-agents", + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "us.anthropic.claude-sonnet-4-20250514-v1:0", + "gen_ai.event.end_time": "2025-11-14T01:00:06.384344+00:00", + "gen_ai.usage.prompt_tokens": "653", + "gen_ai.usage.input_tokens": "653", + "gen_ai.usage.completion_tokens": "27", + "gen_ai.usage.output_tokens": "27", + "gen_ai.usage.total_tokens": "680", + "gen_ai.usage.cache_read_input_tokens": "{\"intValue\":0}", + "gen_ai.usage.cache_write_input_tokens": "{\"intValue\":0}" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "attributes": {} + } + }, + "output": { + "finish_reason": "end_turn", + "message": "[{\"text\": \"Your package is ready for pickup and the total cost is $100. 
You can come collect it whenever convenient for you!\"}]" + }, + "usage": { + "input": 653, + "output": 27, + "total": 680, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": "09a3d2c149be6213", + "promptId": null, + "usageDetails": { + "input": 653, + "output": 27, + "total": 680 + }, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 1822.0, + "timeToFirstToken": null, + "promptTokens": 653, + "completionTokens": 27, + "updatedAt": "2025-11-14T01:00:06.520Z", + "totalTokens": 680, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:06.490Z" + }, + { + "id": "5bd8962273e7ddba", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "SPAN", + "name": "execute_event_loop_cycle", + "startTime": "2025-11-14T01:00:03.135000Z", + "endTime": "2025-11-14T01:00:04.533000Z", + "completionStartTime": null, + "model": null, + "modelParameters": null, + "input": [ + { + "role": "user", + "content": "[{\"text\": \"What is the status of my package\"}]" + }, + { + "role": "assistant", + "content": "[{\"text\": \"I'll check the status of your package for you, Bob.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"name\": \"check_package_status\", \"input\": {\"user_name\": \"Bob\"}}}]" + }, + { + "role": "tool", + "content": "[{\"toolResult\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"status\": \"success\", \"content\": [{\"text\": \"Package is ready for pickup\"}]}}]" + } + ], + "version": "1.10.0", + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:03.135597+00:00", + "event_loop.cycle_id": "e46a39a5-f765-4628-9618-acf741d74b77", + "event_loop.parent_cycle_id": 
"b7e8bff4-37b4-41d2-b939-9547f56ccb24", + "gen_ai.event.end_time": "2025-11-14T01:00:04.533362+00:00" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "attributes": {} + } + }, + "output": { + "message": "[{\"text\": \"Great news! Your package is ready for pickup. Since it's ready, let me also check the total cost for you.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_eyRoAW-ARTq9inBduvuTtg\", \"name\": \"calculate_price\", \"input\": {}}}]", + "tool.result": "[{\"toolResult\": {\"toolUseId\": \"tooluse_eyRoAW-ARTq9inBduvuTtg\", \"status\": \"success\", \"content\": [{\"text\": \"Total cost is $100\"}]}}]" + }, + "usage": { + "input": 0, + "output": 0, + "total": 0, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": "28b7038083b46985", + "promptId": null, + "usageDetails": {}, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 1398.0, + "timeToFirstToken": null, + "promptTokens": 0, + "completionTokens": 0, + "updatedAt": "2025-11-14T01:00:04.666Z", + "totalTokens": 0, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:04.646Z" + }, + { + "id": "364c5b004ed7f062", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "SPAN", + "name": "execute_tool calculate_price", + "startTime": "2025-11-14T01:00:04.479000Z", + "endTime": "2025-11-14T01:00:04.480000Z", + "completionStartTime": null, + "model": null, + "modelParameters": null, + "input": [ + { + "role": "tool", + "content": "{}", + "id": "tooluse_eyRoAW-ARTq9inBduvuTtg" + } 
+ ], + "version": "1.10.0", + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:04.480024+00:00", + "gen_ai.operation.name": "execute_tool", + "gen_ai.system": "strands-agents", + "gen_ai.tool.name": "calculate_price", + "gen_ai.tool.call.id": "tooluse_eyRoAW-ARTq9inBduvuTtg", + "gen_ai.event.end_time": "2025-11-14T01:00:04.480406+00:00", + "tool.status": "success" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "attributes": {} + } + }, + "output": { + "message": "[{\"text\": \"Total cost is $100\"}]", + "id": "tooluse_eyRoAW-ARTq9inBduvuTtg" + }, + "usage": { + "input": 0, + "output": 0, + "total": 0, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": "5bd8962273e7ddba", + "promptId": null, + "usageDetails": {}, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 1.0, + "timeToFirstToken": null, + "promptTokens": 0, + "completionTokens": 0, + "updatedAt": "2025-11-14T01:00:04.641Z", + "totalTokens": 0, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:04.589Z" + }, + { + "id": "290b5c80c9aed3f3", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "GENERATION", + "name": "chat", + "startTime": "2025-11-14T01:00:03.135000Z", + "endTime": "2025-11-14T01:00:04.410000Z", + "completionStartTime": null, + "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", + "modelParameters": {}, + "input": [ + { + "role": "user", + "content": "[{\"text\": \"What is the status 
of my package\"}]" + }, + { + "role": "assistant", + "content": "[{\"text\": \"I'll check the status of your package for you, Bob.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"name\": \"check_package_status\", \"input\": {\"user_name\": \"Bob\"}}}]" + }, + { + "role": "tool", + "content": "[{\"toolResult\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"status\": \"success\", \"content\": [{\"text\": \"Package is ready for pickup\"}]}}]" + } + ], + "version": "1.10.0", + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:03.135749+00:00", + "gen_ai.system": "strands-agents", + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "us.anthropic.claude-sonnet-4-20250514-v1:0", + "gen_ai.event.end_time": "2025-11-14T01:00:04.410065+00:00", + "gen_ai.usage.prompt_tokens": "574", + "gen_ai.usage.input_tokens": "574", + "gen_ai.usage.completion_tokens": "62", + "gen_ai.usage.output_tokens": "62", + "gen_ai.usage.total_tokens": "636", + "gen_ai.usage.cache_read_input_tokens": "{\"intValue\":0}", + "gen_ai.usage.cache_write_input_tokens": "{\"intValue\":0}" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "attributes": {} + } + }, + "output": { + "finish_reason": "tool_use", + "message": "[{\"text\": \"Great news! Your package is ready for pickup. 
Since it's ready, let me also check the total cost for you.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_eyRoAW-ARTq9inBduvuTtg\", \"name\": \"calculate_price\", \"input\": {}}}]" + }, + "usage": { + "input": 574, + "output": 62, + "total": 636, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": "5bd8962273e7ddba", + "promptId": null, + "usageDetails": { + "input": 574, + "output": 62, + "total": 636 + }, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 1275.0, + "timeToFirstToken": null, + "promptTokens": 574, + "completionTokens": 62, + "updatedAt": "2025-11-14T01:00:04.544Z", + "totalTokens": 636, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:04.520Z" + }, + { + "id": "9596bbd4668349ad", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "SPAN", + "name": "execute_event_loop_cycle", + "startTime": "2025-11-14T01:00:01.245000Z", + "endTime": "2025-11-14T01:00:03.086000Z", + "completionStartTime": null, + "model": null, + "modelParameters": null, + "input": [ + { + "role": "user", + "content": "[{\"text\": \"What is the status of my package\"}]" + } + ], + "version": "1.10.0", + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:01.245201+00:00", + "event_loop.cycle_id": "b7e8bff4-37b4-41d2-b939-9547f56ccb24", + "gen_ai.event.end_time": "2025-11-14T01:00:03.086492+00:00" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "attributes": {} + } + }, + 
"output": { + "message": "[{\"text\": \"I'll check the status of your package for you, Bob.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"name\": \"check_package_status\", \"input\": {\"user_name\": \"Bob\"}}}]", + "tool.result": "[{\"toolResult\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"status\": \"success\", \"content\": [{\"text\": \"Package is ready for pickup\"}]}}]" + }, + "usage": { + "input": 0, + "output": 0, + "total": 0, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": "28b7038083b46985", + "promptId": null, + "usageDetails": {}, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 1841.0, + "timeToFirstToken": null, + "promptTokens": 0, + "completionTokens": 0, + "updatedAt": "2025-11-14T01:00:03.188Z", + "totalTokens": 0, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:03.174Z" + }, + { + "id": "bcad6196d9821f76", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "SPAN", + "name": "execute_tool check_package_status", + "startTime": "2025-11-14T01:00:03.022000Z", + "endTime": "2025-11-14T01:00:03.023000Z", + "completionStartTime": null, + "model": null, + "modelParameters": null, + "input": [ + { + "role": "tool", + "content": "{\"user_name\": \"Bob\"}", + "id": "tooluse_sYup7HMFTFOAopB_4kgTqA" + } + ], + "version": "1.10.0", + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:03.022945+00:00", + "gen_ai.operation.name": "execute_tool", + "gen_ai.system": "strands-agents", + "gen_ai.tool.name": "check_package_status", + "gen_ai.tool.call.id": "tooluse_sYup7HMFTFOAopB_4kgTqA", + "gen_ai.event.end_time": 
"2025-11-14T01:00:03.023720+00:00", + "tool.status": "success" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "attributes": {} + } + }, + "output": { + "message": "[{\"text\": \"Package is ready for pickup\"}]", + "id": "tooluse_sYup7HMFTFOAopB_4kgTqA" + }, + "usage": { + "input": 0, + "output": 0, + "total": 0, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": "9596bbd4668349ad", + "promptId": null, + "usageDetails": {}, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 1.0, + "timeToFirstToken": null, + "promptTokens": 0, + "completionTokens": 0, + "updatedAt": "2025-11-14T01:00:03.163Z", + "totalTokens": 0, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:03.130Z" + }, + { + "id": "156c1779bdf680d1", + "traceId": "13cc6db7f2e290b54bab3061a6fc5db0", + "type": "GENERATION", + "name": "chat", + "startTime": "2025-11-14T01:00:01.245000Z", + "endTime": "2025-11-14T01:00:02.922000Z", + "completionStartTime": null, + "model": "us.anthropic.claude-sonnet-4-20250514-v1:0", + "modelParameters": {}, + "input": [ + { + "role": "user", + "content": "[{\"text\": \"What is the status of my package\"}]" + } + ], + "version": "1.10.0", + "metadata": { + "attributes": { + "session.id": "test_session", + "gen_ai.event.start_time": "2025-11-14T01:00:01.245246+00:00", + "gen_ai.system": "strands-agents", + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "us.anthropic.claude-sonnet-4-20250514-v1:0", + "gen_ai.event.end_time": 
"2025-11-14T01:00:02.922311+00:00", + "gen_ai.usage.prompt_tokens": "482", + "gen_ai.usage.input_tokens": "482", + "gen_ai.usage.completion_tokens": "75", + "gen_ai.usage.output_tokens": "75", + "gen_ai.usage.total_tokens": "557", + "gen_ai.usage.cache_read_input_tokens": "{\"intValue\":0}", + "gen_ai.usage.cache_write_input_tokens": "{\"intValue\":0}" + }, + "resourceAttributes": { + "telemetry.sdk.language": "python", + "telemetry.sdk.name": "opentelemetry", + "telemetry.sdk.version": "1.35.0", + "service.name": "strands-agents", + "service.version": "1.10.0" + }, + "scope": { + "name": "strands.telemetry.tracer", + "attributes": {} + } + }, + "output": { + "finish_reason": "tool_use", + "message": "[{\"text\": \"I'll check the status of your package for you, Bob.\"}, {\"toolUse\": {\"toolUseId\": \"tooluse_sYup7HMFTFOAopB_4kgTqA\", \"name\": \"check_package_status\", \"input\": {\"user_name\": \"Bob\"}}}]" + }, + "usage": { + "input": 482, + "output": 75, + "total": 557, + "unit": "TOKENS" + }, + "level": "DEFAULT", + "statusMessage": null, + "parentObservationId": "9596bbd4668349ad", + "promptId": null, + "usageDetails": { + "input": 482, + "output": 75, + "total": 557 + }, + "costDetails": {}, + "environment": "default", + "promptName": null, + "promptVersion": null, + "modelId": null, + "inputPrice": 0.0, + "outputPrice": 0.0, + "totalPrice": 0.0, + "calculatedInputCost": null, + "calculatedOutputCost": null, + "calculatedTotalCost": 0.0, + "latency": 1677.0, + "timeToFirstToken": null, + "promptTokens": 482, + "completionTokens": 75, + "updatedAt": "2025-11-14T01:00:03.112Z", + "totalTokens": 557, + "projectId": "cmhenx6m80006z4075iumt8px", + "unit": "TOKENS", + "createdAt": "2025-11-14T01:00:03.069Z" + } + ], + "scores": [], + "updatedAt": "2025-11-14T01:00:11.186Z", + "projectId": "cmhenx6m80006z4075iumt8px", + "bookmarked": false, + "externalId": null, + "createdAt": "2025-11-14T01:00:03.109Z" +} \ No newline at end of file diff --git 
a/src/examples/langfuse_utils/utils.py b/src/examples/langfuse_utils/utils.py new file mode 100644 index 0000000..17a52ba --- /dev/null +++ b/src/examples/langfuse_utils/utils.py @@ -0,0 +1,62 @@ +import json +import logging +import os + +from langfuse import Langfuse +from langfuse.api.resources.commons.types.trace_with_full_details import TraceWithFullDetails + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def _load_from_json() -> TraceWithFullDetails: + """Load trace from local JSON file and parse into TraceWithFullDetails.""" + file_path = os.path.join(os.path.dirname(__file__), "trace.json") + with open(file_path, "r") as f: + trace_data = json.load(f) + return TraceWithFullDetails(**trace_data) + + +def get_langfuse_trace(trace_id: str) -> list[TraceWithFullDetails]: + # Check if env vars are set, if not fallback to JSON + if not all(os.environ.get(key) for key in ["LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY", "LANGFUSE_BASE_URL"]): + logger.info("Langfuse environment variables not set. Loading from local trace.json") + return [_load_from_json()] + + try: + # Initialize Langfuse client + langfuse = Langfuse( + public_key=os.environ.get("LANGFUSE_PUBLIC_KEY", ""), + secret_key=os.environ.get("LANGFUSE_SECRET_KEY", ""), + host=os.environ.get("LANGFUSE_BASE_URL", ""), + ) + # Download trace + trace = langfuse.api.trace.get(trace_id) + return [trace] + except Exception as e: + logger.warning(f"Langfuse fetch failed: {e}. Loading from local trace.json") + return [_load_from_json()] + + +def get_langfuse_session(session_id: str) -> list[TraceWithFullDetails]: + # Check if env vars are set, if not fallback to JSON + if not all(os.environ.get(key) for key in ["LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY", "LANGFUSE_BASE_URL"]): + logger.info("Langfuse environment variables not set. 
Loading from local trace.json") + return [_load_from_json()] + + try: + # Initialize Langfuse client + langfuse = Langfuse( + public_key=os.environ.get("LANGFUSE_PUBLIC_KEY", ""), + secret_key=os.environ.get("LANGFUSE_SECRET_KEY", ""), + host=os.environ.get("LANGFUSE_BASE_URL", ""), + ) + session = langfuse.api.sessions.get(session_id) + traces = [] + for trace in session.traces: + traces.append(langfuse.api.trace.get(trace.id)) + traces.sort(key=lambda x: x.timestamp) + return traces + except Exception as e: + logger.warning(f"Langfuse session fetch failed: {e}. Loading from local trace.json") + return [_load_from_json()] diff --git a/src/strands_evals/mappers/langfuse_mapper.py b/src/strands_evals/mappers/langfuse_mapper.py new file mode 100644 index 0000000..94bbd56 --- /dev/null +++ b/src/strands_evals/mappers/langfuse_mapper.py @@ -0,0 +1,328 @@ +"""Convert Langfuse TraceWithFullDetails to Strands Session object.""" + +import json +import logging +from typing import Any + +from langfuse.api.resources.commons.types.trace_with_full_details import ( # type: ignore[import-not-found] + TraceWithFullDetails, +) + +from ..types.trace import ( + AgentInvocationSpan, + AssistantMessage, + InferenceSpan, + Session, + SpanInfo, + TextContent, + ToolCall, + ToolCallContent, + ToolConfig, + ToolExecutionSpan, + ToolResult, + ToolResultContent, + Trace, + UserMessage, +) +from .session_mapper import SessionMapper + +logger = logging.getLogger(__name__) + + +class LangfuseSessionMapper(SessionMapper): + """Maps Langfuse TraceWithFullDetails to Session format for evaluation.""" + + def map_to_session(self, spans: list[TraceWithFullDetails], session_id: str) -> Session: + """Map Langfuse traces to Session format.""" + traces = [] + for langfuse_trace in spans: + try: + trace = self._convert_trace(langfuse_trace) + if trace.spans: + traces.append(trace) + except Exception as e: + logger.warning(f"Failed to convert trace {langfuse_trace.id}: {e}") + + return 
Session(traces=traces, session_id=session_id) + + def get_agent_final_response(self, session: Session) -> str: + """Get the final response from the agent.""" + final_response = "" + for trace in session.traces: + for span in trace.spans: + if isinstance(span, AgentInvocationSpan): + final_response = span.agent_response + return final_response + + def _convert_trace(self, langfuse_trace: TraceWithFullDetails) -> Trace: + """Convert Langfuse TraceWithFullDetails to Strands Trace.""" + spans: list[InferenceSpan | ToolExecutionSpan | AgentInvocationSpan] = [] + sorted_observations = sorted( + langfuse_trace.observations, + key=lambda obs: obs.start_time, + ) + + for obs in sorted_observations: + try: + span_info = self._create_span_info(obs, langfuse_trace) + operation_name = self._get_operation_name(obs) + + if operation_name == "chat": + span = self._convert_inference_span(obs, span_info) + if span.messages: + spans.append(span) + elif operation_name == "execute_tool": + spans.append(self._convert_tool_execution_span(obs, span_info)) + elif operation_name == "invoke_agent": + spans.append(self._convert_agent_invocation_span(obs, span_info)) + except Exception as e: + logger.warning(f"Failed to convert span {obs.id}: {e}") + + return Trace( + spans=spans, + trace_id=langfuse_trace.id, + session_id=langfuse_trace.session_id or langfuse_trace.id, + ) + + def _create_span_info(self, obs: Any, langfuse_trace: TraceWithFullDetails) -> SpanInfo: + """Create SpanInfo from observation.""" + return SpanInfo( + trace_id=langfuse_trace.id, + span_id=obs.id, + session_id=langfuse_trace.session_id or langfuse_trace.id, + parent_span_id=obs.parent_observation_id, + start_time=obs.start_time, + end_time=obs.end_time or obs.start_time, + ) + + def _get_operation_name(self, obs: Any) -> str: + """Extract operation name from observation metadata.""" + try: + return obs.metadata["attributes"].get("gen_ai.operation.name", "") if obs.metadata else "" + except (AttributeError, TypeError, 
KeyError): + return "" + + def _parse_json_attr(self, value: str, default: str = "[]") -> Any: + """Parse JSON attribute with error handling.""" + try: + return json.loads(value) + except (json.JSONDecodeError, TypeError): + return json.loads(default) + + def _extract_text_from_content(self, content_list: list[dict[str, Any]]) -> str: + """Extract text from content list.""" + for content_item in content_list: + if "text" in content_item: + return content_item["text"] + return "" + + def _parse_langfuse_messages(self, input_data: list[dict[str, Any]]) -> list[UserMessage | AssistantMessage]: + """Parse Langfuse message format to Strands message format.""" + messages: list[UserMessage | AssistantMessage] = [] + + for item in input_data: + try: + role = item.get("role", "") + content_str = item.get("content", "") + + content_list = self._parse_json_attr(content_str, '[{"text": ""}]') + + if role == "user": + user_content = self._process_user_content(content_list) + if user_content: + messages.append(UserMessage(content=user_content)) + + elif role == "assistant": + assistant_content = self._process_assistant_content(content_list) + if assistant_content: + messages.append(AssistantMessage(content=assistant_content)) + + elif role == "tool": + tool_content = self._process_tool_content(content_list) + if tool_content: + messages.append(UserMessage(content=list(tool_content))) + except Exception as e: + logger.warning(f"Failed to parse message with role {item.get('role', 'unknown')}: {e}") + + return messages + + def _process_user_content(self, content_list: list[dict[str, Any]]) -> list[TextContent | ToolResultContent]: + """Process user message content.""" + user_content: list[TextContent | ToolResultContent] = [] + for content_item in content_list: + if "text" in content_item: + user_content.append(TextContent(text=content_item["text"])) + elif "toolResult" in content_item: + tool_result = content_item["toolResult"] + result_text = 
self._extract_tool_result_text(tool_result) + user_content.append( + ToolResultContent( + content=result_text, + tool_call_id=tool_result.get("toolUseId"), + ) + ) + return user_content + + def _process_assistant_content(self, content_list: list[dict[str, Any]]) -> list[TextContent | ToolCallContent]: + """Process assistant message content.""" + assistant_content: list[TextContent | ToolCallContent] = [] + for content_item in content_list: + if "text" in content_item: + assistant_content.append(TextContent(text=content_item["text"])) + elif "toolUse" in content_item: + tool_use = content_item["toolUse"] + assistant_content.append( + ToolCallContent( + name=tool_use["name"], + arguments=tool_use.get("input", {}), + tool_call_id=tool_use.get("toolUseId"), + ) + ) + return assistant_content + + def _process_tool_content(self, content_list: list[dict[str, Any]]) -> list[ToolResultContent]: + """Process tool message content.""" + tool_content = [] + for content_item in content_list: + if "toolResult" in content_item: + tool_result = content_item["toolResult"] + result_text = self._extract_tool_result_text(tool_result) + tool_content.append( + ToolResultContent( + content=result_text, + tool_call_id=tool_result.get("toolUseId"), + ) + ) + return tool_content + + def _extract_tool_result_text(self, tool_result: dict[str, Any]) -> str: + """Extract text from tool result.""" + if "content" in tool_result and tool_result["content"]: + content = tool_result["content"] + if isinstance(content, list) and content: + return content[0].get("text", "") if isinstance(content[0], dict) else str(content[0]) + return str(content) + return "" + + def _parse_output_messages(self, output: dict[str, Any]) -> list[AssistantMessage | UserMessage]: + """Parse output messages from inference span.""" + messages: list[AssistantMessage | UserMessage] = [] + + if "message" in output: + content_list = self._parse_json_attr(output["message"]) + assistant_content = 
self._process_assistant_content(content_list) + if assistant_content: + messages.append(AssistantMessage(content=assistant_content)) + + if "tool.result" in output: + content_list = self._parse_json_attr(output["tool.result"]) + user_content = self._process_user_content(content_list) + if user_content: + messages.append(UserMessage(content=user_content)) + + return messages + + def _convert_inference_span(self, obs: Any, span_info: SpanInfo) -> InferenceSpan: + """Convert observation to InferenceSpan.""" + messages = [] + if obs.input: + messages.extend(self._parse_langfuse_messages(obs.input)) + + if obs.output: + messages.extend(self._parse_output_messages(obs.output)) + return InferenceSpan(span_info=span_info, messages=messages) + + def _convert_tool_execution_span(self, obs: Any, span_info: SpanInfo) -> ToolExecutionSpan: + """Convert observation to ToolExecutionSpan.""" + + tool_arguments = {} + + try: + tool_arguments = json.loads(obs.input[0]["content"]) + except Exception as e: + logger.warning(f"Failed to extract tool arguments: {e}") + + tool_call = ToolCall( + name=obs.name or "unknown_tool", + arguments=tool_arguments, + tool_call_id=obs.id, + ) + + tool_content = self._extract_tool_output(obs.output) + tool_result = ToolResult(content=tool_content, tool_call_id=obs.id) + + return ToolExecutionSpan(span_info=span_info, tool_call=tool_call, tool_result=tool_result) + + def _extract_tool_output(self, output: Any) -> str: + """Extract tool output content.""" + if not output: + return "" + + try: + if isinstance(output, dict) and "message" in output: + content_list = self._parse_json_attr(output["message"]) + return self._extract_text_from_content(content_list) + return str(output) + except Exception as e: + logger.warning(f"Failed to extract tool output: {e}") + return str(output) if output else "" + + def _convert_agent_invocation_span(self, obs: Any, span_info: SpanInfo) -> AgentInvocationSpan: + """Convert observation to AgentInvocationSpan.""" + 
user_prompt = self._extract_user_prompt(obs.input) + agent_response = self._extract_agent_response(obs.output) + available_tools = self._extract_available_tools(obs.metadata) + + return AgentInvocationSpan( + span_info=span_info, + user_prompt=user_prompt, + agent_response=agent_response, + available_tools=available_tools, + ) + + def _extract_user_prompt(self, input_data: Any) -> str: + """Extract user prompt from input data.""" + if not input_data: + return "" + + try: + for item in input_data: + if item.get("role") == "user": + content_list = self._parse_json_attr(item.get("content", "[]")) + return self._extract_text_from_content(content_list) + except Exception as e: + logger.warning(f"Failed to extract user prompt: {e}") + return "" + + def _extract_agent_response(self, output: Any) -> str: + """Extract agent response from output.""" + if not output: + return "" + + try: + if isinstance(output, str): + return output + elif isinstance(output, dict) and "message" in output: + return output["message"] + return str(output) + except Exception as e: + logger.warning(f"Failed to extract agent response: {e}") + return str(output) if output else "" + + def _extract_available_tools(self, metadata: Any) -> list[ToolConfig]: + """Extract available tools from metadata.""" + available_tools = [] + try: + if metadata and "attributes" in metadata: + tools = self._parse_json_attr(metadata["attributes"].get("gen_ai.agent.tools", "[]")) + for tool_data in tools: + available_tools.append(ToolConfig(name=tool_data)) + except Exception as e: + logger.warning(f"Failed to extract available tools: {e}") + return available_tools + + +def convert_trace_to_session(langfuse_trace: TraceWithFullDetails) -> Session: + """Convert Langfuse TraceWithFullDetails to Strands Session.""" + mapper = LangfuseSessionMapper() + return mapper.map_to_session([langfuse_trace], langfuse_trace.session_id or langfuse_trace.id) diff --git a/tests/strands_evals/mappers/test_langfuse_mapper.py 
b/tests/strands_evals/mappers/test_langfuse_mapper.py new file mode 100644 index 0000000..9daca59 --- /dev/null +++ b/tests/strands_evals/mappers/test_langfuse_mapper.py @@ -0,0 +1,179 @@ +from datetime import datetime, timezone +from unittest.mock import Mock + +from strands_evals.mappers.langfuse_mapper import LangfuseSessionMapper +from strands_evals.types.trace import AgentInvocationSpan, InferenceSpan, ToolExecutionSpan + + +def create_mock_observation(obs_id, name, operation_name, input_data=None, output=None, metadata=None): + """Create a mock Langfuse observation.""" + obs = Mock() + obs.id = obs_id + obs.name = name + obs.input = input_data + obs.output = output + obs.metadata = metadata or {"attributes": {"gen_ai.operation.name": operation_name}} + obs.start_time = datetime.now(timezone.utc) + obs.end_time = datetime.now(timezone.utc) + obs.created_at = datetime.now(timezone.utc) + obs.parent_observation_id = None + return obs + + +def create_mock_trace(trace_id, session_id, observations): + """Create a mock Langfuse TraceWithFullDetails.""" + trace = Mock() + trace.id = trace_id + trace.session_id = session_id + trace.observations = observations + return trace + + +def test_inference_span(): + obs = create_mock_observation( + "obs1", + "chat", + "chat", + input_data=[ + {"role": "user", "content": '[{"text": "hello"}]'}, + ], + output={"message": '[{"text": "hi"}]'}, + ) + + trace = create_mock_trace("trace1", "session1", [obs]) + + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + inference = session.traces[0].spans[0] + assert isinstance(inference, InferenceSpan) + assert inference.messages[0].content[0].text == "hello" + assert inference.messages[1].content[0].text == "hi" + + +def test_agent_invocation_span(): + obs = create_mock_observation( + "obs1", + "invoke_agent", + "invoke_agent", + input_data=[{"role": "user", "content": '[{"text": "2+2"}]'}], + output={"message": "4"}, + metadata={"attributes": 
{"gen_ai.operation.name": "invoke_agent", "gen_ai.agent.tools": '["calc"]'}}, + ) + + trace = create_mock_trace("trace1", "session1", [obs]) + + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + agent = session.traces[0].spans[0] + assert isinstance(agent, AgentInvocationSpan) + assert agent.user_prompt == "2+2" + assert agent.agent_response == "4" + assert agent.available_tools[0].name == "calc" + + +def test_tool_execution_span(): + obs = create_mock_observation( + "obs1", + "calculate_price", + "execute_tool", + input_data=[{"role": "tool", "content": '{"expr": "2+2"}', "id": "t1"}], + output={"message": '[{"text": "4"}]', "id": "t1"}, + ) + + trace = create_mock_trace("trace1", "session1", [obs]) + + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + tool = session.traces[0].spans[0] + assert isinstance(tool, ToolExecutionSpan) + assert tool.tool_call.name == "calculate_price" + assert tool.tool_result.content == "4" + + +def test_tool_use_in_message(): + obs = create_mock_observation( + "obs1", + "chat", + "chat", + input_data=[ + {"role": "user", "content": '[{"text": "calc"}]'}, + {"role": "assistant", "content": '[{"toolUse": {"toolUseId": "t1", "name": "calc", "input": {"x": 1}}}]'}, + ], + ) + + trace = create_mock_trace("trace1", "session1", [obs]) + + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + msg = session.traces[0].spans[0].messages[1] + assert msg.content[0].name == "calc" + assert msg.content[0].tool_call_id == "t1" + + +def test_tool_result_in_message(): + obs = create_mock_observation( + "obs1", + "chat", + "chat", + input_data=[ + {"role": "tool", "content": '[{"toolResult": {"toolUseId": "t1", "content": [{"text": "result"}]}}]'} + ], + ) + + trace = create_mock_trace("trace1", "session1", [obs]) + + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + msg = session.traces[0].spans[0].messages[0] + assert msg.content[0].content == "result" + assert 
msg.content[0].tool_call_id == "t1" + + +def test_multiple_traces(): + obs1 = create_mock_observation( + "obs1", "chat", "chat", input_data=[{"role": "assistant", "content": '[{"text": "a"}]'}] + ) + obs2 = create_mock_observation( + "obs2", "chat", "chat", input_data=[{"role": "assistant", "content": '[{"text": "b"}]'}] + ) + + trace1 = create_mock_trace("trace1", "session1", [obs1]) + trace2 = create_mock_trace("trace2", "session1", [obs2]) + + session = LangfuseSessionMapper().map_to_session([trace1, trace2], "session1") + + assert len(session.traces) == 2 + + +def test_get_agent_final_response(): + obs = create_mock_observation("obs1", "invoke_agent", "invoke_agent", output={"message": "final response"}) + + trace = create_mock_trace("trace1", "session1", [obs]) + + mapper = LangfuseSessionMapper() + session = mapper.map_to_session([trace], "session1") + + final_response = mapper.get_agent_final_response(session) + assert final_response == "final response" + + +def test_empty_observations(): + trace = create_mock_trace("trace1", "session1", []) + + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + assert len(session.traces) == 0 + + +def test_invalid_json_handling(): + obs = create_mock_observation("obs1", "chat", "chat", input_data=[{"role": "user", "content": "invalid json"}]) + + trace = create_mock_trace("trace1", "session1", [obs]) + + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + # Should handle gracefully and create message with empty text + inference = session.traces[0].spans[0] + assert isinstance(inference, InferenceSpan) + assert len(inference.messages) == 1 + assert inference.messages[0].content[0].text == "" From 8376c57af20b33f71993143d0da44612635e5bfa Mon Sep 17 00:00:00 2001 From: Deepak Dalakoti Date: Tue, 25 Nov 2025 09:39:24 +1100 Subject: [PATCH 2/5] run formatting --- src/strands_evals/dataset.py | 6 +++--- src/strands_evals/types/evaluation_report.py | 6 +++--- 2 files changed, 6 
insertions(+), 6 deletions(-) diff --git a/src/strands_evals/dataset.py b/src/strands_evals/dataset.py index 6046878..61abc23 100644 --- a/src/strands_evals/dataset.py +++ b/src/strands_evals/dataset.py @@ -451,7 +451,7 @@ def to_file(self, path: str): ValueError: If the path has a non-JSON extension. """ file_path = Path(path) - + if file_path.suffix: if file_path.suffix != ".json": raise ValueError( @@ -524,7 +524,7 @@ def from_file(cls, path: str, custom_evaluators: list[type[Evaluator]] | None = ValueError: If the file does not have a .json extension. """ file_path = Path(path) - + if file_path.suffix != ".json": raise ValueError( f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension." @@ -532,5 +532,5 @@ def from_file(cls, path: str, custom_evaluators: list[type[Evaluator]] | None = with open(file_path, "r") as f: data = json.load(f) - + return cls.from_dict(data, custom_evaluators) diff --git a/src/strands_evals/types/evaluation_report.py b/src/strands_evals/types/evaluation_report.py index 7ac8141..4f1cad4 100644 --- a/src/strands_evals/types/evaluation_report.py +++ b/src/strands_evals/types/evaluation_report.py @@ -202,7 +202,7 @@ def to_file(self, path: str): ValueError: If the path has a non-JSON extension. """ file_path = Path(path) - + if file_path.suffix: if file_path.suffix != ".json": raise ValueError( @@ -232,7 +232,7 @@ def from_file(cls, path: str): ValueError: If the file does not have a .json extension. """ file_path = Path(path) - + if file_path.suffix != ".json": raise ValueError( f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension." 
@@ -240,5 +240,5 @@ def from_file(cls, path: str): with open(file_path, "r") as f: data = json.load(f) - + return cls.from_dict(data) From 8497aeaa3692b87ffbe9ba10dbef8ca2d317be5a Mon Sep 17 00:00:00 2001 From: Deepak Dalakoti Date: Tue, 25 Nov 2025 10:22:33 +1100 Subject: [PATCH 3/5] readme --- README.md | 47 +++++++++++++++++++ .../helpfulness_evaluator_langfuse.py | 2 +- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d813cae..aa97d06 100644 --- a/README.md +++ b/README.md @@ -213,6 +213,53 @@ dataset = await generator.generate_dataset( dataset.to_file("generated_math_research_dataset") ``` +### Evaluating Langfuse Traces + +```python +from langfuse_utils.utils import get_langfuse_trace + +from strands_evals import Case, Dataset +from strands_evals.evaluators import HelpfulnessEvaluator +from strands_evals.mappers.langfuse_mapper import ( + LangfuseSessionMapper, +) + +# 1. Define a task function +def user_task_function(case: Case) -> str: + # Initialise Langfuse mapper + mapper = LangfuseSessionMapper() + # Get trace id from case if available + trace_id = case.metadata.get("trace_id", "") if case.metadata else "" + # Fetch traces from langfuse + langfuse_traces = get_langfuse_trace(trace_id=trace_id) + # Map to session + session = mapper.map_to_session(langfuse_traces, trace_id) + agent_response = mapper.get_agent_final_response(session) + + return {"output": str(agent_response), "trajectory": session} + +# 2. Define test cases +test_cases = [ + Case[str, str]( + name="knowledge-1", + input="What is the status of my package", + metadata={"category": "knowledge", "trace_id": "xxxxxxx"}, + ), +] + + +# 3. Create an evaluator +evaluator = HelpfulnessEvaluator() + +# 4. Create a dataset +dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator) + +# 5. 
Run evaluations +report = dataset.run_evaluations(user_task_function) +report.run_display() +``` + + ## Available Evaluators - **OutputEvaluator**: Evaluates the quality and correctness of agent outputs diff --git a/src/examples/helpfulness_evaluator_langfuse.py b/src/examples/helpfulness_evaluator_langfuse.py index 5050d43..269f97d 100644 --- a/src/examples/helpfulness_evaluator_langfuse.py +++ b/src/examples/helpfulness_evaluator_langfuse.py @@ -21,7 +21,7 @@ def user_task_function(case: Case) -> str: return {"output": str(agent_response), "trajectory": session} - +# 2. Define test cases test_cases = [ Case[str, str]( name="knowledge-1", From 12278a792f668c672a4ad116e5f180f86ac15a54 Mon Sep 17 00:00:00 2001 From: Deepak Dalakoti Date: Tue, 25 Nov 2025 10:42:36 +1100 Subject: [PATCH 4/5] add more tests --- .../helpfulness_evaluator_langfuse.py | 1 + .../mappers/test_langfuse_mapper.py | 184 ++++++++++++++++++ 2 files changed, 185 insertions(+) diff --git a/src/examples/helpfulness_evaluator_langfuse.py b/src/examples/helpfulness_evaluator_langfuse.py index 269f97d..5c320d5 100644 --- a/src/examples/helpfulness_evaluator_langfuse.py +++ b/src/examples/helpfulness_evaluator_langfuse.py @@ -21,6 +21,7 @@ def user_task_function(case: Case) -> str: return {"output": str(agent_response), "trajectory": session} + # 2. 
Define test cases test_cases = [ Case[str, str]( diff --git a/tests/strands_evals/mappers/test_langfuse_mapper.py b/tests/strands_evals/mappers/test_langfuse_mapper.py index 9daca59..a2a7d3a 100644 --- a/tests/strands_evals/mappers/test_langfuse_mapper.py +++ b/tests/strands_evals/mappers/test_langfuse_mapper.py @@ -177,3 +177,187 @@ def test_invalid_json_handling(): assert isinstance(inference, InferenceSpan) assert len(inference.messages) == 1 assert inference.messages[0].content[0].text == "" + + +def test_parent_child_relationships(): + parent_obs = create_mock_observation( + "parent", "chat", "chat", input_data=[{"role": "user", "content": '[{"text": "parent"}]'}] + ) + child_obs = create_mock_observation( + "child", "chat", "chat", input_data=[{"role": "user", "content": '[{"text": "child"}]'}] + ) + child_obs.parent_observation_id = "parent" + + trace = create_mock_trace("trace1", "session1", [parent_obs, child_obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + assert session.traces[0].spans[1].span_info.parent_span_id == "parent" + + +def test_mixed_content_in_message(): + obs = create_mock_observation( + "obs1", + "chat", + "chat", + input_data=[ + { + "role": "assistant", + "content": """[{"text": "Let me calculate"}, + {"toolUse": {"toolUseId": "t1", "name": "calc", "input": {}}}]""", + } + ], + ) + + trace = create_mock_trace("trace1", "session1", [obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + msg = session.traces[0].spans[0].messages[0] + assert len(msg.content) == 2 + assert msg.content[0].text == "Let me calculate" + assert msg.content[1].name == "calc" + + +def test_tool_result_in_output(): + obs = create_mock_observation( + "obs1", + "chat", + "chat", + output={"tool.result": '[{"toolResult": {"toolUseId": "t1", "content": [{"text": "done"}]}}]'}, + ) + + trace = create_mock_trace("trace1", "session1", [obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + 
+ msg = session.traces[0].spans[0].messages[0] + assert msg.content[0].content == "done" + + +def test_failed_trace_conversion(): + valid_obs = create_mock_observation( + "obs1", "chat", "chat", input_data=[{"role": "user", "content": '[{"text": "hi"}]'}] + ) + invalid_obs = create_mock_observation("obs2", "chat", "chat") + invalid_obs.start_time = None # Will cause error + + valid_trace = create_mock_trace("trace1", "session1", [valid_obs]) + invalid_trace = create_mock_trace("trace2", "session1", [invalid_obs]) + + session = LangfuseSessionMapper().map_to_session([valid_trace, invalid_trace], "session1") + + assert len(session.traces) == 1 + + +def test_empty_content_arrays(): + obs = create_mock_observation("obs1", "chat", "chat", input_data=[{"role": "user", "content": "[]"}]) + + trace = create_mock_trace("trace1", "session1", [obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + # Empty content results in no messages, so span is filtered out + assert len(session.traces) == 0 + + +def test_agent_response_string_format(): + obs = create_mock_observation("obs1", "invoke_agent", "invoke_agent", output="string response") + + trace = create_mock_trace("trace1", "session1", [obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + agent = session.traces[0].spans[0] + assert agent.agent_response == "string response" + + +def test_multiple_agent_invocations(): + obs1 = create_mock_observation("obs1", "invoke_agent", "invoke_agent", output={"message": "first"}) + obs2 = create_mock_observation("obs2", "invoke_agent", "invoke_agent", output={"message": "second"}) + + trace = create_mock_trace("trace1", "session1", [obs1, obs2]) + mapper = LangfuseSessionMapper() + session = mapper.map_to_session([trace], "session1") + + assert mapper.get_agent_final_response(session) == "second" + + +def test_tool_execution_with_invalid_arguments(): + obs = create_mock_observation( + "obs1", "execute_tool", "execute_tool", 
input_data=[{"role": "tool", "content": "invalid"}] + ) + + trace = create_mock_trace("trace1", "session1", [obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + tool = session.traces[0].spans[0] + assert tool.tool_call.arguments == {} + + +def test_end_time_none(): + obs = create_mock_observation( + "obs1", "chat", "chat", input_data=[{"role": "user", "content": '[{"text": "test"}]'}] + ) + obs.end_time = None + + trace = create_mock_trace("trace1", "session1", [obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + span_info = session.traces[0].spans[0].span_info + assert span_info.end_time == span_info.start_time + + +def test_no_metadata(): + obs = create_mock_observation( + "obs1", "chat", "chat", input_data=[{"role": "user", "content": '[{"text": "test"}]'}] + ) + obs.metadata = None + + trace = create_mock_trace("trace1", "session1", [obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + # No metadata means no operation name, so span is filtered out + assert len(session.traces) == 0 + + +def test_session_with_no_valid_traces(): + obs = create_mock_observation("obs1", "unknown", "unknown") + + trace = create_mock_trace("trace1", "session1", [obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + assert len(session.traces) == 0 + + +def test_convert_trace_to_session_function(): + from strands_evals.mappers.langfuse_mapper import convert_trace_to_session + + obs = create_mock_observation( + "obs1", "chat", "chat", input_data=[{"role": "user", "content": '[{"text": "test"}]'}] + ) + trace = create_mock_trace("trace1", "session1", [obs]) + + session = convert_trace_to_session(trace) + + assert session.session_id == "session1" + assert len(session.traces) == 1 + + +def test_complex_tool_result_content(): + obs = create_mock_observation( + "obs1", + "chat", + "chat", + input_data=[{"role": "tool", "content": '[{"toolResult": {"toolUseId": "t1", "content": 
["text_result"]}}]'}], + ) + + trace = create_mock_trace("trace1", "session1", [obs]) + session = LangfuseSessionMapper().map_to_session([trace], "session1") + + msg = session.traces[0].spans[0].messages[0] + assert msg.content[0].content == "text_result" + + +def test_no_session_id_uses_trace_id(): + obs = create_mock_observation("obs1", "chat", "chat", input_data=[{"role": "user", "content": '[{"text": "hi"}]'}]) + trace = create_mock_trace("trace1", None, [obs]) + + session = LangfuseSessionMapper().map_to_session([trace], "trace1") + + assert session.traces[0].session_id == "trace1" From 36528e5f95e808de4a8fcc41b7b3dd96174e6325 Mon Sep 17 00:00:00 2001 From: Deepak Dalakoti Date: Tue, 25 Nov 2025 11:04:36 +1100 Subject: [PATCH 5/5] remove comment --- src/examples/goal_success_rate_evaluator_langfuse.py | 4 ---- src/examples/helpfulness_evaluator_langfuse.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/examples/goal_success_rate_evaluator_langfuse.py b/src/examples/goal_success_rate_evaluator_langfuse.py index 2c2aca5..b821128 100644 --- a/src/examples/goal_success_rate_evaluator_langfuse.py +++ b/src/examples/goal_success_rate_evaluator_langfuse.py @@ -4,10 +4,6 @@ from strands_evals.evaluators import GoalSuccessRateEvaluator from strands_evals.mappers.langfuse_mapper import LangfuseSessionMapper -# ====================================== -# SETUP AND RUN STRANDS EVAL -# ====================================== - # 1. Define a task function def user_task_function(case: Case) -> dict: diff --git a/src/examples/helpfulness_evaluator_langfuse.py b/src/examples/helpfulness_evaluator_langfuse.py index 5c320d5..f813be9 100644 --- a/src/examples/helpfulness_evaluator_langfuse.py +++ b/src/examples/helpfulness_evaluator_langfuse.py @@ -6,10 +6,6 @@ LangfuseSessionMapper, ) -# ====================================== -# SETUP AND RUN STRANDS EVAL -# ====================================== - # 1. Define a task function def user_task_function(case: Case) -> str: