diff --git a/01-tutorials/07-AgentCore-E2E/lab-07-agent-evaluation.ipynb b/01-tutorials/07-AgentCore-E2E/lab-07-agent-evaluation.ipynb new file mode 100644 index 00000000..8458afcf --- /dev/null +++ b/01-tutorials/07-AgentCore-E2E/lab-07-agent-evaluation.ipynb @@ -0,0 +1,811 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Customer Support Agent Evaluation Framework\n", + "\n", + "## Overview\n", + "\n", + "This notebook implements the [Strands Agents evaluation strategy](https://strandsagents.com/0.1.x/documentation/docs/user-guide/observability-evaluation/evaluation/?h=evaluation) with LLM Judge Evaluation to comprehensively assess your customer support agent running on Amazon Bedrock AgentCore Runtime.\n", + "\n", + "### Evaluation Strategy\n", + "\n", + "- **Multi-dimensional Quality Assessment**: Helpfulness, accuracy, clarity, professionalism, completeness\n", + "- **Tool Usage Analysis**: Appropriate tool selection and usage patterns\n", + "- **Performance Metrics**: Response times and success rates\n", + "- **LLM-as-Judge**: Claude 4 Sonnet for objective evaluation with Tool Usage tracked using x-ray and observability\n", + "- **Summarize test results**\n", + "- **Save outputs to a file**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup and Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install boto3 requests strands-agents bedrock-agentcore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "import json\n", + "import time\n", + "import uuid\n", + "import boto3\n", + "import requests\n", + "from dataclasses import dataclass, asdict\n", + "from typing import Dict, List, Any, Optional\n", + "from datetime import datetime\n", + "\n", + "\n", + "# AWS clients\n", + "bedrock = boto3.client('bedrock-runtime')\n", + "ssm = boto3.client('ssm')\n", + "\n", + "print(\"✅ Dependencies loaded successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration\n", + "AGENT_NAME = \"customer_support_agent\" \n", + "EVALUATOR_MODEL = \"us.anthropic.claude-sonnet-4-20250514-v1:0\"\n", + "\n", + "# Helper function to get SSM parameters\n", + "def get_ssm_parameter(name: str) -> str:\n", + " try:\n", + " response = ssm.get_parameter(Name=name, WithDecryption=True)\n", + " return response['Parameter']['Value']\n", + " except Exception as e:\n", + " raise Exception(f\"Failed to get SSM parameter {name}: {e}\")\n", + "\n", + "# Get agent endpoint\n", + "try:\n", + " AGENT_ENDPOINT = get_ssm_parameter(f\"/app/customersupport/agentcore/runtime_arn\") # This is the default value set in lab-04-agentcore-runtime.ipynb \n", + " # print(f\"✅ Agent endpoint: {AGENT_ENDPOINT}\")\n", + "except Exception as e:\n", + " print(f\"⚠️ Could not get agent endpoint: {e}\")\n", + " AGENT_ENDPOINT = \"http://localhost:8080\" # Fallback for local testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Classes and Test Cases" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class TestCase:\n", + " id: str\n", + " query: str\n", + " category: str\n", + " expected_tools: List[str]\n", + " 
expected_criteria: Dict[str, Any]\n", + " description: str\n", + "\n", + "@dataclass\n", + "class EvaluationResult:\n", + " test_case_id: str\n", + " query: str\n", + " response: str\n", + " metrics: Dict[str, float]\n", + " response_time: float\n", + " success: bool\n", + " error_message: Optional[str] = None\n", + " tool_calls: List[str] = None\n", + " \n", + " def to_dict(self):\n", + " return asdict(self)\n", + "\n", + "# Test cases\n", + "TEST_CASES = [\n", + " TestCase(\n", + " id=\"basic_greeting\",\n", + " query=\"Hi, I need help with my account\",\n", + " category=\"basic_inquiry\",\n", + " expected_tools=[],\n", + " expected_criteria={\"should_be_polite\": True, \"should_ask_for_details\": True},\n", + " description=\"Basic greeting and help request\"\n", + " ),\n", + " TestCase(\n", + " id=\"return_policy_check\",\n", + " query=\"What is your return policy for electronics?\",\n", + " category=\"policy_inquiry\",\n", + " expected_tools=[\"get_return_policy\"],\n", + " expected_criteria={\"should_provide_policy\": True, \"should_be_clear\": True},\n", + " description=\"Return policy information request\"\n", + " ),\n", + " TestCase(\n", + " id=\"product_info_request\",\n", + " query=\"Can you tell me about the Gaming Console Pro specifications?\",\n", + " category=\"product_inquiry\",\n", + " expected_tools=[\"get_product_info\"],\n", + " expected_criteria={\"should_provide_specs\": True, \"should_be_detailed\": True},\n", + " description=\"Product information request\"\n", + " ),\n", + " TestCase(\n", + " id=\"troubleshooting_general\",\n", + " query=\"I have overheating issues with my device, help me debug\",\n", + " category=\"technical_support\",\n", + " expected_tools=[],\n", + " expected_criteria={\"should_ask_device_details\": True, \"should_provide_steps\": True},\n", + " description=\"Technical troubleshooting without specialized tools\"\n", + " )\n", + "]\n", + "\n", + "print(f\"✅ Loaded {len(TEST_CASES)} test cases\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Agent Invocation and Evaluation Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize AgentCore Runtime client\n", + "from bedrock_agentcore_starter_toolkit import Runtime\n", + "from lab_helpers.utils import reauthenticate_user, get_existing_cognito_config\n", + "import os\n", + "import time\n", + "import uuid \n", + "from typing import Dict, List, Any, Optional\n", + "import boto3\n", + "# Enable verbose logging for requests\n", + "import logging\n", + "## Set the below config parameters if you want to see detailed logs from the agent\n", + "#logging.basicConfig(level=logging.DEBUG)\n", + "#logging.getLogger(\"urllib3.connectionpool\").setLevel(logging.DEBUG)\n", + "import urllib.parse\n", + "\n", + "session_id = uuid.uuid4()\n", + "agentcore_client = boto3.client(\n", + " 'bedrock-agentcore',\n", + " )\n", + "REGION_NAME=\"us-east-1\"\n", + "cognito_config = get_existing_cognito_config()\n", + "\n", + "async def invoke_agent(query: str, actor_id: str = \"testuser\") -> Dict[str, Any]:\n", + " \"\"\"Invoke the agent using AgentCore Runtime SDK with JWT token\"\"\"\n", + " start_time = time.time()\n", + " \n", + " # Get bearer token\n", + " bearer_token = reauthenticate_user(\n", + " cognito_config.get(\"client_id\"), \n", + " cognito_config.get(\"client_secret\")\n", + " )\n", + " \n", + " #print(f\"Using Agent ARN from environment: {AGENT_ENDPOINT}\")\n", + "\n", + " # URL encode the agent 
ARN\n", + " escaped_agent_arn = urllib.parse.quote(AGENT_ENDPOINT, safe='')\n", + "\n", + " # Construct the URL\n", + " url = f\"https://bedrock-agentcore.{REGION_NAME}.amazonaws.com/runtimes/{escaped_agent_arn}/invocations?qualifier=DEFAULT\"\n", + "\n", + " # Set up headers\n", + " headers = {\n", + " \"Authorization\": f\"Bearer {bearer_token}\", #f\"Bearer {auth_token}\",\n", + " \"X-Amzn-Trace-Id\": \"1234\", \n", + " \"Content-Type\": \"application/json\",\n", + " \"X-Amzn-Bedrock-AgentCore-Runtime-Session-Id\": f\"eval-session-{uuid.uuid4()}\"\n", + " }\n", + " invoke_response = requests.post(\n", + " url,\n", + " headers=headers,\n", + " data=json.dumps({\"prompt\": query})\n", + " )\n", + "\n", + " # Print response in a safe manner\n", + " print(f\"Status Code: {invoke_response.status_code}\")\n", + " print(f\"Response Headers: {dict(invoke_response.headers)}\")\n", + "\n", + " if invoke_response.status_code == 200:\n", + " try:\n", + " response_data = invoke_response.json()\n", + " print(\"Response JSON:\", response_data)\n", + " \n", + " # Handle both string and dict responses\n", + " if isinstance(response_data, str):\n", + " response_text = response_data\n", + " elif isinstance(response_data, dict):\n", + " response_text = response_data.get(\"result\", str(response_data))\n", + " else:\n", + " response_text = str(response_data)\n", + " \n", + " return {\n", + " \"response\": response_text,\n", + " \"success\": True,\n", + " \"tool_calls\": extract_tool_calls(invoke_response, response_text), # Pass response object\n", + " \"response_time\": time.time() - start_time\n", + " }\n", + " except json.JSONDecodeError:\n", + " return {\n", + " \"response\": invoke_response.text,\n", + " \"success\": True,\n", + " \"tool_calls\": [],\n", + " \"response_time\": time.time() - start_time\n", + " }\n", + " else:\n", + " error_msg = f\"Error ({invoke_response.status_code}): {invoke_response.text[:500]}\"\n", + " print(error_msg)\n", + " if invoke_response.status_code >= 400:\n", + " print(\"Please ensure lab-04-agentcore-runtime.ipynb has been executed and agent is deployed\")\n", + " return {\n", + " \"response\": error_msg,\n", + " \"success\": False,\n", + " \"tool_calls\": [],\n", + " \"response_time\": time.time() - start_time\n", + " }\n", + " \n", + " \n", + "\"\"\"\n", + "AgentCore observability tool extraction using gen_ai.tool.name and tool.status.\n", + "\"\"\"\n", + "\n", + "\n", + "def extract_tool_calls_from_agentcore_observability(response_obj, response_text: str = \"\") -> List[str]:\n", + " \"\"\"Extract tool calls using AgentCore observability gen_ai.tool.name and tool.status.\"\"\"\n", + " \n", + " # Extract session ID from headers\n", + " session_id = None\n", + " if hasattr(response_obj, 'headers'):\n", + " headers = dict(response_obj.headers)\n", + " session_id = headers.get('X-Amzn-Bedrock-AgentCore-Runtime-Session-Id')\n", + " if not session_id and 'baggage' in headers:\n", + " baggage = headers['baggage']\n", + " if 'session.id=' in baggage:\n", + " session_id = baggage.split('session.id=')[1].split(',')[0]\n", + " \n", + " tools = []\n", + " \n", + " # Query X-Ray for gen_ai.tool.name spans\n", + " if session_id:\n", + " try:\n", + " xray_client = boto3.client('xray')\n", + " \n", + " # Get traces with gen_ai annotations\n", + " response = xray_client.get_trace_summaries(\n", + " TimeRangeType='Service',\n", + " StartTime=time.time() - 300,\n", + " EndTime=time.time(),\n", + " ServiceName='bedrock-agentcore'\n", + " )\n", + " \n", + " for trace_summary in 
response.get('TraceSummaries', []):\n", + " trace_response = xray_client.batch_get_traces(TraceIds=[trace_summary['Id']])\n", + " \n", + " for trace in trace_response.get('Traces', []):\n", + " for segment in trace.get('Segments', []):\n", + " segment_doc = json.loads(segment['Document'])\n", + " \n", + " # Check for gen_ai.tool.name in annotations\n", + " annotations = segment_doc.get('annotations', {})\n", + " if 'gen_ai.tool.name' in annotations:\n", + " tool_name = annotations['gen_ai.tool.name']\n", + " tool_status = annotations.get('tool.status', 'success')\n", + " \n", + " # Only include successful tool calls\n", + " if tool_status in ['success', 'completed']:\n", + " tools.append(tool_name)\n", + " \n", + " # Check subsegments\n", + " for subsegment in segment_doc.get('subsegments', []):\n", + " sub_annotations = subsegment.get('annotations', {})\n", + " if 'gen_ai.tool.name' in sub_annotations:\n", + " tool_name = sub_annotations['gen_ai.tool.name']\n", + " tool_status = sub_annotations.get('tool.status', 'success')\n", + " \n", + " if tool_status in ['success', 'completed']:\n", + " tools.append(tool_name)\n", + " \n", + " except Exception as e:\n", + " print(f\"X-Ray observability extraction failed: {e}\")\n", + " \n", + " # Fallback to content analysis if no observability data\n", + " if not tools and response_text:\n", + " if any(phrase in response_text.lower() for phrase in [\n", + " \"return policy\", \"30-day return\", \"refunds typically process\"\n", + " ]):\n", + " tools.append(\"get_return_policy\")\n", + " \n", + " if any(phrase in response_text.lower() for phrase in [\n", + " \"gaming console pro\", \"specifications\", \"technical details\"\n", + " ]):\n", + " tools.append(\"get_product_info\")\n", + " \n", + " return list(set(tools))\n", + " \n", + "\n", + "def extract_tool_calls(response_obj, response_text: str = \"\") -> List[str]:\n", + " \"\"\"Extract tool calls using AgentCore gen_ai.tool.name observability.\"\"\"\n", + " \n", + " # Extract session ID from headers\n", + " session_id = None\n", + " if hasattr(response_obj, 'headers'):\n", + " headers = dict(response_obj.headers)\n", + " session_id = headers.get('X-Amzn-Bedrock-AgentCore-Runtime-Session-Id')\n", + " \n", + " tools = []\n", + " \n", + " # Query X-Ray for gen_ai.tool.name spans\n", + " if session_id:\n", + " try:\n", + " xray_client = boto3.client('xray')\n", + " response = xray_client.get_trace_summaries(\n", + " TimeRangeType='Service',\n", + " StartTime=time.time() - 300,\n", + " EndTime=time.time(),\n", + " ServiceName='bedrock-agentcore'\n", + " )\n", + " \n", + " for trace_summary in response.get('TraceSummaries', []):\n", + " trace_response = xray_client.batch_get_traces(TraceIds=[trace_summary['Id']])\n", + " \n", + " for trace in trace_response.get('Traces', []):\n", + " for segment in trace.get('Segments', []):\n", + " segment_doc = json.loads(segment['Document'])\n", + " \n", + " # Check annotations for gen_ai.tool.name\n", + " annotations = segment_doc.get('annotations', {})\n", + " if 'gen_ai.tool.name' in annotations and annotations.get('tool.status') == 'success':\n", + " tools.append(annotations['gen_ai.tool.name'])\n", + " \n", + " except Exception:\n", + " pass\n", + " \n", + " # Fallback to content analysis\n", + " if not tools and response_text:\n", + " if \"return policy\" in response_text.lower():\n", + " tools.append(\"get_return_policy\")\n", + " if \"gaming console pro\" in response_text.lower():\n", + " tools.append(\"get_product_info\")\n", + " \n", + " return 
list(set(tools))\n", + "\n", + "\n", + "print(\"✅ Agent invocation functions defined\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cognito_config # Ensure the output matches what you see from lab-04" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "async def evaluate_response_quality(query: str, response: str, criteria: Dict[str, Any]) -> Dict[str, float]:\n", + "    \"\"\"Evaluate response quality using Claude as judge\"\"\"\n", + "    \n", + "    evaluation_prompt = f\"\"\"\n", + "    You are an expert evaluator for customer support AI agents. Evaluate the following response on a scale of 1-5 for each metric.\n", + "\n", + "    Customer Query: {query}\n", + "    Agent Response: {response}\n", + "\n", + "    Evaluate on these metrics (1=Poor, 2=Below Average, 3=Average, 4=Good, 5=Excellent):\n", + "\n", + "    1. HELPFULNESS: Does the response address the customer's needs and provide useful information?\n", + "    2. ACCURACY: Is the information provided factually correct and reliable?\n", + "    3. CLARITY: Is the response clear, well-structured, and easy to understand?\n", + "    4. PROFESSIONALISM: Does the response maintain appropriate tone and professionalism?\n", + "    5. COMPLETENESS: Does the response fully address all aspects of the query?\n", + "\n", + "    Expected criteria: {json.dumps(criteria, indent=2)}\n", + "\n", + "    Respond with ONLY a JSON object in this format:\n", + "    {{\n", + "        \"helpfulness\": <score 1-5>,\n", + "        \"accuracy\": <score 1-5>,\n", + "        \"clarity\": <score 1-5>,\n", + "        \"professionalism\": <score 1-5>,\n", + "        \"completeness\": <score 1-5>,\n", + "        \"reasoning\": \"Brief explanation of scores\"\n", + "    }}\n", + "    \"\"\"\n", + "    \n", + "    try:\n", + "        response_obj = bedrock.invoke_model(\n", + "            modelId=EVALUATOR_MODEL,\n", + "            body=json.dumps({\n", + "                \"anthropic_version\": \"bedrock-2023-05-31\",\n", + "                \"max_tokens\": 1000,\n", + "                \"messages\": [\n", + "                    {\"role\": \"user\", \"content\": evaluation_prompt}\n", + "                ]\n", + "            })\n", + "        )\n", + "        \n", + "        result = json.loads(response_obj['body'].read())\n", + "        content = result['content'][0]['text']\n", + "        \n", + "        # Extract JSON from response\n", + "        start_idx = content.find('{')\n", + "        end_idx = content.rfind('}') + 1\n", + "        json_str = content[start_idx:end_idx]\n", + "        \n", + "        scores = json.loads(json_str)\n", + "        return {k: v for k, v in scores.items() if k != \"reasoning\"}\n", + "    \n", + "    except Exception as e:\n", + "        print(f\"Error in quality evaluation: {e}\")\n", + "        return {\n", + "            \"helpfulness\": 0.0,\n", + "            \"accuracy\": 0.0,\n", + "            \"clarity\": 0.0,\n", + "            \"professionalism\": 0.0,\n", + "            \"completeness\": 0.0\n", + "        }\n", + "\n", + "def evaluate_tool_usage(expected_tools: List[str], actual_tools: List[str]) -> float:\n", + "    \"\"\"Evaluate tool usage effectiveness\"\"\"\n", + "    if not expected_tools:\n", + "        return 5.0 if not actual_tools else 3.0\n", + "    \n", + "    if not actual_tools:\n", + "        print(f\"Expected tools {expected_tools}, while actual tools {actual_tools}\")\n", + "        return 0.0  # Return 0 if tools expected but none called\n", + "    \n", + "    expected_set = set(expected_tools)\n", + "    actual_set = set(actual_tools)\n", + "    \n", + "    precision = len(expected_set.intersection(actual_set)) / len(actual_set) if actual_set else 0\n", + "    recall = len(expected_set.intersection(actual_set)) / len(expected_set) if expected_set else 0\n", + "    \n", + "    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 
else 0\n", + "    return f1 * 5  # Scale to 0-5\n", + "\n", + "print(\"✅ Evaluation functions defined\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Single Test Case Evaluation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "async def evaluate_test_case(test_case: TestCase) -> EvaluationResult:\n", + "    \"\"\"Evaluate a single test case\"\"\"\n", + "    print(f\"🔍 Evaluating: {test_case.id} - {test_case.description}\")\n", + "    \n", + "    # Invoke agent\n", + "    agent_result = await invoke_agent(test_case.query)\n", + "    \n", + "    # Handle None result\n", + "    if agent_result is None:\n", + "        return EvaluationResult(\n", + "            test_case_id=test_case.id,\n", + "            query=test_case.query,\n", + "            response=\"\",\n", + "            metrics={},\n", + "            response_time=0.0,\n", + "            success=False,\n", + "            error_message=\"invoke_agent returned None\"\n", + "        )\n", + "    \n", + "    if not agent_result[\"success\"]:\n", + "        return EvaluationResult(\n", + "            test_case_id=test_case.id,\n", + "            query=test_case.query,\n", + "            response=\"\",\n", + "            metrics={},\n", + "            response_time=agent_result[\"response_time\"],\n", + "            success=False,\n", + "            error_message=agent_result.get(\"error_message\", agent_result[\"response\"])  # invoke_agent's failure dict carries the error text in \"response\"\n", + "        )\n", + "    \n", + "    # Evaluate response quality\n", + "    quality_scores = await evaluate_response_quality(\n", + "        test_case.query,\n", + "        agent_result[\"response\"],\n", + "        test_case.expected_criteria\n", + "    )\n", + "    \n", + "    # Evaluate tool usage\n", + "    tool_score = evaluate_tool_usage(\n", + "        test_case.expected_tools,\n", + "        agent_result[\"tool_calls\"]\n", + "    )\n", + "    \n", + "    # Combine all metrics\n", + "    metrics = {\n", + "        **quality_scores,\n", + "        \"tool_usage\": tool_score,\n", + "        \"response_time\": agent_result[\"response_time\"]\n", + "    }\n", + "    \n", + "    return EvaluationResult(\n", + "        test_case_id=test_case.id,\n", + "        query=test_case.query,\n", + "        response=agent_result[\"response\"],\n", + "        metrics=metrics,\n", + "        response_time=agent_result[\"response_time\"],\n", + "        success=True,\n", + "        tool_calls=agent_result[\"tool_calls\"]\n", + "    )\n", + "\n", + "print(\"✅ Test case evaluation function defined\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run Single Test Case (Demo)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test a single case first\n", + "demo_test = TEST_CASES[0]  # Basic greeting\n", + "demo_result = await evaluate_test_case(demo_test)\n", + "\n", + "print(f\"\\n📊 Demo Result for '{demo_test.id}':\")\n", + "print(f\"Query: {demo_result.query}\")\n", + "print(f\"Response: {demo_result.response[:200]}...\" if len(demo_result.response) > 200 else f\"Response: {demo_result.response}\")\n", + "print(f\"Response Time: {demo_result.response_time:.3f}s\")\n", + "print(f\"Tool Calls: {demo_result.tool_calls}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Full Evaluation Suite" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "async def run_full_evaluation(test_cases: List[TestCase]) -> Dict[str, Any]:\n", + "    \"\"\"Run evaluation on all test cases\"\"\"\n", + "    print(f\"🚀 Starting evaluation of {len(test_cases)} test cases...\")\n", + "    \n", + "    results = []\n", + "    for i, test_case in enumerate(test_cases, 1):\n", + "        print(f\"\\n[{i}/{len(test_cases)}] Processing: {test_case.id}\")\n", + "        result = await 
evaluate_test_case(test_case)\n", + " results.append(result)\n", + " \n", + " # Brief pause between tests\n", + " await asyncio.sleep(1)\n", + " \n", + " # Calculate summary statistics\n", + " summary = calculate_summary(results)\n", + " \n", + " return {\n", + " \"agent_name\": AGENT_NAME,\n", + " \"total_test_cases\": len(test_cases),\n", + " \"results\": [result.to_dict() for result in results],\n", + " \"summary\": summary,\n", + " \"timestamp\": datetime.now().isoformat()\n", + " }\n", + "\n", + "def calculate_summary(results: List[EvaluationResult]) -> Dict[str, Any]:\n", + " \"\"\"Calculate summary statistics\"\"\"\n", + " successful_results = [r for r in results if r.success]\n", + " \n", + " if not successful_results:\n", + " return {\"error\": \"No successful test cases\"}\n", + " \n", + " # Average scores\n", + " metrics = [\"helpfulness\", \"accuracy\", \"clarity\", \"professionalism\", \"completeness\", \"tool_usage\"]\n", + " avg_scores = {}\n", + " \n", + " for metric in metrics:\n", + " scores = [r.metrics.get(metric, 0) for r in successful_results if metric in r.metrics]\n", + " avg_scores[metric] = sum(scores) / len(scores) if scores else 0\n", + " \n", + " # Response time statistics\n", + " response_times = sorted([r.response_time for r in successful_results])\n", + " n = len(response_times)\n", + " \n", + " percentiles = {\n", + " \"p50\": response_times[n//2] if n > 0 else 0,\n", + " \"p90\": response_times[int(n*0.9)] if n > 0 else 0,\n", + " \"p95\": response_times[int(n*0.95)] if n > 0 else 0,\n", + " \"p99\": response_times[int(n*0.99)] if n > 0 else 0,\n", + " }\n", + " \n", + " return {\n", + " \"success_rate\": len(successful_results) / len(results),\n", + " \"average_scores\": avg_scores,\n", + " \"overall_score\": sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0,\n", + " \"response_time_percentiles\": percentiles,\n", + " \"total_successful\": len(successful_results),\n", + " \"total_failed\": len(results) - len(successful_results)\n", + " }\n", + "\n", + "def safe_score_calculation(metrics):\n", + " numeric_values = [v for k, v in metrics.items() \n", + " if k != 'response_time' and isinstance(v, (int, float))]\n", + " return sum(numeric_values) / len(numeric_values) if numeric_values else 0\n", + "\n", + "print(\"✅ Full evaluation functions defined\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run full evaluation\n", + "evaluation_results = await run_full_evaluation(TEST_CASES)\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"📊 EVALUATION COMPLETE\")\n", + "print(\"=\"*60)\n", + "\n", + "# Display LLM Judge scores for each test\n", + "print(\"\\n🤖 LLM JUDGE SCORES BY TEST CASE:\")\n", + "print(\"-\" * 60)\n", + "for result in evaluation_results.get(\"results\", []):\n", + " if result[\"success\"] and result.get(\"metrics\"):\n", + " metrics = result[\"metrics\"]\n", + " print(f\"\\n📝 {result['test_case_id'].upper()}:\")\n", + " print(f\" Helpfulness: {metrics.get('helpfulness', 0):.1f}/5.0\")\n", + " print(f\" Accuracy: {metrics.get('accuracy', 0):.1f}/5.0\")\n", + " print(f\" Clarity: {metrics.get('clarity', 0):.1f}/5.0\")\n", + " print(f\" Professionalism: {metrics.get('professionalism', 0):.1f}/5.0\")\n", + " print(f\" Completeness: {metrics.get('completeness', 0):.1f}/5.0\")\n", + " print(f\" Tool Usage: {metrics.get('tool_usage', 0):.1f}/5.0\")\n", + " avg_score = safe_score_calculation(metrics)\n", + " else:\n", + " print(f\"\\n❌ 
{result['test_case_id'].upper()}: FAILED\")\n", + "        print(f\"   Error: {result.get('error_message', 'Unknown error')}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Results Analysis and Visualization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def print_detailed_summary(results: dict):\n", + "    \"\"\"Print comprehensive evaluation summary\"\"\"\n", + "    summary = results.get(\"summary\", {})\n", + "    \n", + "    print(f\"🤖 Agent: {results['agent_name']}\")\n", + "    print(f\"📝 Total Test Cases: {results['total_test_cases']}\")\n", + "    print(f\"✅ Success Rate: {summary.get('success_rate', 0):.1%}\")\n", + "    print(f\"🎯 Overall Score: {summary.get('overall_score', 0):.2f}/5.0\")\n", + "    \n", + "    print(\"\\n📈 QUALITY METRICS (1-5 scale):\")\n", + "    avg_scores = summary.get(\"average_scores\", {})\n", + "    for metric, score in avg_scores.items():\n", + "        if metric != \"response_time\":\n", + "            emoji = \"🟢\" if score >= 4.0 else \"🟡\" if score >= 3.0 else \"🔴\"\n", + "            print(f\"   {emoji} {metric.title()}: {score:.2f}\")\n", + "    \n", + "    print(\"\\n⏱️ RESPONSE TIME PERCENTILES:\")\n", + "    percentiles = summary.get(\"response_time_percentiles\", {})\n", + "    for p, time_val in percentiles.items():\n", + "        print(f\"   {p.upper()}: {time_val:.3f}s\")\n", + "    \n", + "    print(\"\\n📋 DETAILED RESULTS:\")\n", + "    for result in results.get(\"results\", []):\n", + "        status = \"✅\" if result[\"success\"] else \"❌\"\n", + "        if result[\"success\"] and result.get(\"metrics\"):\n", + "            score = safe_score_calculation(result[\"metrics\"])\n", + "            print(f\"   {status} {result['test_case_id']}: {score:.2f}/5.0 ({result['response_time']:.3f}s)\")\n", + "        else:\n", + "            print(f\"   {status} {result['test_case_id']}: FAILED - {result.get('error_message', 'Unknown error')}\")\n", + "\n", + "# Print detailed summary\n", + "print_detailed_summary(evaluation_results)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save Results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save results to file\n", + "output_file = f\"evaluation_results_{AGENT_NAME}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json\"\n", + "\n", + "with open(output_file, 'w') as f:\n", + "    json.dump(evaluation_results, f, indent=2)\n", + "\n", + "print(f\"💾 Results saved to: {output_file}\")" ] } ], "metadata": { "kernelspec": { "display_name": ".parent_venv (3.12.4)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/01-tutorials/07-AgentCore-E2E/lab_helpers/utils.py b/01-tutorials/07-AgentCore-E2E/lab_helpers/utils.py index 4d687288..e3596c9b 100644 --- a/01-tutorials/07-AgentCore-E2E/lab_helpers/utils.py +++ b/01-tutorials/07-AgentCore-E2E/lab_helpers/utils.py @@ -333,6 +333,17 @@ def cleanup_cognito_resources(pool_id): return False +def get_existing_cognito_config(): +    """Get the existing Cognito configuration used in lab-04""" +    try: +        secret_value = get_customer_support_secret() +        if secret_value: +            return json.loads(secret_value) +        return None +    except Exception as e: +        print(f"Error getting 
existing Cognito config: {e}") + return None + def reauthenticate_user(client_id, client_secret): boto_session = Session() region = boto_session.region_name
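To sanity-check `get_existing_cognito_config()` against the deployed runtime, the sketch below condenses the notebook's `invoke_agent()` flow into a standalone script. It is a minimal sketch, assuming the lab-04 prerequisites are in place: the runtime ARN stored at `/app/customersupport/agentcore/runtime_arn`, the Cognito client credentials readable via `get_customer_support_secret()`, and an agent deployed in `us-east-1`.

```python
# Minimal smoke test mirroring the notebook's invoke_agent() flow.
# Assumption: lab-04 has been run, so the SSM parameter and Cognito secret exist.
import json
import urllib.parse
import uuid

import boto3
import requests

from lab_helpers.utils import get_existing_cognito_config, reauthenticate_user

REGION_NAME = "us-east-1"  # assumption: same region as lab-04

# Resolve the deployed AgentCore runtime ARN from SSM.
ssm = boto3.client("ssm", region_name=REGION_NAME)
agent_arn = ssm.get_parameter(
    Name="/app/customersupport/agentcore/runtime_arn", WithDecryption=True
)["Parameter"]["Value"]

# Fetch the stored Cognito config and mint a fresh bearer token.
cognito_config = get_existing_cognito_config()
token = reauthenticate_user(
    cognito_config["client_id"], cognito_config["client_secret"]
)

# Invoke the runtime over HTTPS, as the notebook does.
url = (
    f"https://bedrock-agentcore.{REGION_NAME}.amazonaws.com/"
    f"runtimes/{urllib.parse.quote(agent_arn, safe='')}/invocations?qualifier=DEFAULT"
)
response = requests.post(
    url,
    headers={
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "X-Amzn-Bedrock-AgentCore-Runtime-Session-Id": f"eval-session-{uuid.uuid4()}",
    },
    data=json.dumps({"prompt": "What is your return policy for electronics?"}),
)
print(response.status_code)
print(response.json() if response.ok else response.text[:500])
```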