diff --git a/pipeline.py b/pipeline.py index 4b58fad..ddc5bee 100644 --- a/pipeline.py +++ b/pipeline.py @@ -63,10 +63,10 @@ def main(): "--timeout", type=int, default=3600, help="Timeout in seconds for agent execution" ) parser.add_argument( - "--stream", - action="store_true", - default=False, - help="Use streaming execution (default: False, uses non-streaming)", + "--reasoning-effort", + default="default", + choices=["default", "minimal", "low", "medium", "high"], + help="Reasoning effort level for supported models (default: None)", ) # Output configuration @@ -138,7 +138,7 @@ def main(): timeout=args.timeout, exp_name=run_exp_name, output_dir=run_output_dir, - stream=args.stream, + reasoning_effort=args.reasoning_effort, ) pipeline.run_evaluation(args.tasks) diff --git a/src/agents/__init__.py b/src/agents/__init__.py new file mode 100644 index 0000000..ccbf714 --- /dev/null +++ b/src/agents/__init__.py @@ -0,0 +1,11 @@ +""" +MCPMark Agent Module +==================== + +Provides a unified agent implementation using LiteLLM for model interactions +and minimal MCP server management. +""" + +from .mcpmark_agent import MCPMarkAgent + +__all__ = ["MCPMarkAgent"] \ No newline at end of file diff --git a/src/agents/mcp/__init__.py b/src/agents/mcp/__init__.py new file mode 100644 index 0000000..09824d3 --- /dev/null +++ b/src/agents/mcp/__init__.py @@ -0,0 +1,11 @@ +""" +MCP (Model Context Protocol) Components +======================================== + +Minimal MCP server implementations for MCPMark. +""" + +from .stdio_server import MCPStdioServer +from .http_server import MCPHttpServer + +__all__ = ["MCPStdioServer", "MCPHttpServer"] \ No newline at end of file diff --git a/src/agents/mcp/http_server.py b/src/agents/mcp/http_server.py new file mode 100644 index 0000000..9c0cc36 --- /dev/null +++ b/src/agents/mcp/http_server.py @@ -0,0 +1,78 @@ +""" +Minimal MCP HTTP Server Implementation +======================================= + +Provides HTTP-based MCP server communication for services like GitHub. +""" + +import asyncio +from contextlib import AsyncExitStack +from typing import Any, Dict, List, Optional + +from mcp import ClientSession +from mcp.client.streamable_http import streamablehttp_client + +class MCPHttpServer: + """ + HTTP-based MCP client using the official MCP Python SDK + (Streamable HTTP transport). 
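A minimal usage sketch of this client, assuming the GitHub MCP endpoint that `_create_http_server` uses later in this patch and a token in the `GITHUB_TOKEN` environment variable; the tool name and arguments are illustrative, not part of the patch:

```python
import asyncio
import os

from src.agents.mcp import MCPHttpServer


async def main() -> None:
    # Endpoint and header shape mirror what MCPMarkAgent builds for the GitHub service.
    server = MCPHttpServer(
        url="https://api.githubcopilot.com/mcp/",
        headers={"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"},
    )
    async with server:
        tools = await server.list_tools()  # cached after the first call
        print([tool["name"] for tool in tools])

        # Tool name and arguments are placeholders; pick one reported by list_tools().
        result = await server.call_tool("search_repositories", {"query": "mcpmark"})
        print(result)


asyncio.run(main())
```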
+ """ + + def __init__( + self, + url: str, + headers: Optional[Dict[str, str]] = None, + timeout: int = 30, + ): + self.url = url.rstrip("/") + self.headers = headers or {} + self.timeout = timeout + + self._stack: Optional[AsyncExitStack] = None + self.session: Optional[ClientSession] = None + self._tools_cache: Optional[List[Dict[str, Any]]] = None + + async def __aenter__(self): + await self.start() + return self + + async def __aexit__(self, exc_type, exc, tb): + await self.stop() + + async def start(self): + """Open Streamable HTTP transport and initialize MCP session.""" + self._stack = AsyncExitStack() + + read_stream, write_stream, _ = await self._stack.enter_async_context( + streamablehttp_client(self.url, headers=self.headers) + ) + + self.session = await self._stack.enter_async_context(ClientSession(read_stream, write_stream)) + await asyncio.wait_for(self.session.initialize(), timeout=self.timeout) + + async def stop(self): + """Close the session/transport cleanly.""" + if self._stack: + await self._stack.aclose() + self._stack = None + self.session = None + self._tools_cache = None + + async def list_tools(self) -> List[Dict[str, Any]]: + """Return tool definitions (cached).""" + if self._tools_cache is not None: + return self._tools_cache + if not self.session: + raise RuntimeError("MCP HTTP client not started") + + resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout) + self._tools_cache = [t.model_dump() for t in resp.tools] + return self._tools_cache + + async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any: + """Invoke a remote tool and return the structured result.""" + if not self.session: + raise RuntimeError("MCP HTTP client not started") + + result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout) + return result.model_dump() diff --git a/src/agents/mcp/stdio_server.py b/src/agents/mcp/stdio_server.py new file mode 100644 index 0000000..80368fd --- /dev/null +++ b/src/agents/mcp/stdio_server.py @@ -0,0 +1,46 @@ +""" +Minimal MCP Stdio Server Implementation +======================================== + +Provides stdio-based MCP server communication for services like +Notion, Filesystem, Playwright, and Postgres. 
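A matching sketch for the stdio transport, reusing the `npx` filesystem server invocation that `MCPMarkAgent._create_stdio_server` builds further down; the `/tmp/mcpmark` directory and the tool call are illustrative:

```python
import asyncio

from src.agents.mcp import MCPStdioServer


async def main() -> None:
    server = MCPStdioServer(
        command="npx",
        args=["-y", "@modelcontextprotocol/server-filesystem", "/tmp/mcpmark"],
    )
    async with server:
        tools = await server.list_tools()
        print([tool["name"] for tool in tools])

        # Arguments follow each tool's inputSchema; this call is illustrative.
        result = await server.call_tool("list_directory", {"path": "/tmp/mcpmark"})
        print(result)


asyncio.run(main())
```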
+""" + +import asyncio +import os +from contextlib import AsyncExitStack +from typing import Any, Dict, List, Optional + +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client + +class MCPStdioServer: + """Lightweight wrapper around the official MCP Python SDK.""" + + def __init__(self, command: str, args: List[str], env: Optional[Dict[str, str]] = None, timeout: int = 120): + self.params = StdioServerParameters(command=command, args=args, env={**os.environ, **(env or {})}) + self.timeout = timeout + self._stack: Optional[AsyncExitStack] = None + self._streams = None + self.session: Optional[ClientSession] = None + + async def __aenter__(self): + self._stack = AsyncExitStack() + read, write = await self._stack.enter_async_context(stdio_client(self.params)) + self.session = await self._stack.enter_async_context(ClientSession(read, write)) + await asyncio.wait_for(self.session.initialize(), timeout=self.timeout) + return self + + async def __aexit__(self, exc_type, exc, tb): + if self._stack: + await self._stack.aclose() + self._stack = None + self.session = None + + async def list_tools(self) -> List[Dict[str, Any]]: + resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout) + return [t.model_dump() for t in resp.tools] + + async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any: + result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout) + return result.model_dump() # As above, convert the result to a plain dict diff --git a/src/agents/mcpmark_agent.py b/src/agents/mcpmark_agent.py new file mode 100644 index 0000000..a6f729c --- /dev/null +++ b/src/agents/mcpmark_agent.py @@ -0,0 +1,818 @@ +""" +MCPMark Agent Implementation +============================ + +Unified agent using LiteLLM for all model interactions with minimal MCP support. +""" + +import asyncio +import json +import time +import uuid + +from typing import Any, Dict, List, Optional, Callable + +import litellm +import nest_asyncio + +from src.logger import get_logger +from .mcp import MCPStdioServer, MCPHttpServer +from .utils import TokenUsageTracker + +# Apply nested asyncio support +nest_asyncio.apply() + +# Configure LiteLLM +litellm.suppress_debug_info = True + +logger = get_logger(__name__) + +class MCPMarkAgent: + """ + Unified agent for LLM and MCP server management using LiteLLM. + + - Anthropic models: Native MCP support via extra_body + - Other models: Manual MCP server management with function calling + """ + + # Constants + MAX_TURNS = 100 + DEFAULT_TIMEOUT = 600 + + # Service categories + STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres"] + HTTP_SERVICES = ["github"] + + def __init__( + self, + litellm_input_model_name: str, + api_key: str, + base_url: str, + mcp_service: str, + timeout: int = DEFAULT_TIMEOUT, + service_config: Optional[Dict[str, Any]] = None, + service_config_provider: Optional[Callable[[], Dict]] = None, + reasoning_effort: Optional[str] = "default", + ): + """ + Initialize the MCPMark agent.
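For orientation, a construction sketch in the spirit of how `src/evaluator.py` wires this agent up; the literal values here are illustrative stand-ins for what `ModelConfig` and the service state manager provide:

```python
from src.agents import MCPMarkAgent

agent = MCPMarkAgent(
    litellm_input_model_name="openai/gpt-5",            # LiteLLM route, e.g. from ModelConfig
    api_key="sk-...",                                    # provider key (illustrative)
    base_url=None,                                       # optional gateway override
    mcp_service="filesystem",
    timeout=600,
    service_config={"test_directory": "/tmp/mcpmark"},   # what the filesystem service needs
    reasoning_effort="high",                             # "default" leaves the parameter unset
)
```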
+ + Args: + model_name: Name of the LLM model + api_key: API key for the model provider + base_url: Base url + mcp_service: MCP service type + timeout: Execution timeout in seconds + service_config: Service-specific configuration + service_config_provider: Optional provider for dynamic config + reasoning_effort: Reasoning effort level ("default", "minimal", "low", "medium", "high") + """ + self.litellm_input_model_name = litellm_input_model_name + self.api_key = api_key + self.base_url = base_url + self.mcp_service = mcp_service + self.timeout = timeout + self.service_config = service_config or {} + self._service_config_provider = service_config_provider + self.reasoning_effort = reasoning_effort + + # Detect if this is an Anthropic model with HTTP MCP service + self.is_anthropic = self._is_anthropic_model(litellm_input_model_name) + self.use_anthropic_native = self.is_anthropic and mcp_service in self.HTTP_SERVICES + + # Initialize usage tracker + self.usage_tracker = TokenUsageTracker() + + # Track the actual model name from responses + self.litellm_run_model_name = None + + logger.debug( + f"Initialized MCPMarkAgent for '{mcp_service}' with model '{litellm_input_model_name}' " + f"(Anthropic Native MCP: {self.use_anthropic_native}, Reasoning: {reasoning_effort})" + ) + + def _is_anthropic_model(self, model_name: str) -> bool: + """Check if the model is an Anthropic model.""" + return "claude" in model_name.lower() + + + def _refresh_service_config(self): + """Refresh service config from provider if available.""" + if self._service_config_provider: + try: + latest_cfg = self._service_config_provider() or {} + self.service_config.update(latest_cfg) + except Exception as e: + logger.warning(f"| Failed to refresh service config: {e}") + + async def execute( + self, + instruction: str, + tool_call_log_file: Optional[str] = None + ) -> Dict[str, Any]: + """ + Execute instruction with the agent. 
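Whichever execution path is taken, the returned dictionary has the same shape; a consumption sketch, assuming an `agent` constructed as in the previous example (the instruction and log path are illustrative):

```python
result = agent.execute_sync(
    "Create a file named notes.txt in the test directory.",
    tool_call_log_file="tool_calls.log",  # optional
)

print(result["success"])         # bool
print(result["turn_count"])      # LLM turns used (1 on the Anthropic native path)
print(result["token_usage"])     # input/output/total tokens, plus reasoning_tokens when reported
print(result["execution_time"])  # wall-clock seconds measured by execute()
print(result["error"])           # None on success, otherwise the error message
print(len(result["output"]))     # conversation converted to the legacy SDK message format
```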
+ + Args: + instruction: The instruction/prompt to execute + tool_call_log_file: Optional path to log tool calls + + Returns: + Dictionary containing execution results + """ + start_time = time.time() + + try: + # Refresh service configuration + self._refresh_service_config() + + # Execute with timeout control + async def _execute_with_strategy(): + if self.use_anthropic_native: + # Use native MCP support only for Anthropic + HTTP MCP services (e.g., GitHub) + return await self._execute_anthropic_native( + instruction, tool_call_log_file + ) + else: + # Use manual MCP management for all other cases + # This includes: non-Anthropic models, or Anthropic with STDIO services + return await self._execute_with_manual_mcp( + instruction, tool_call_log_file + ) + + # Apply timeout to the entire execution + result = await asyncio.wait_for( + _execute_with_strategy(), + timeout=self.timeout + ) + + execution_time = time.time() - start_time + + # Update usage statistics + self.usage_tracker.update( + success=result["success"], + token_usage=result.get("token_usage", {}), + turn_count=result.get("turn_count", 0), + execution_time=execution_time + ) + + result["execution_time"] = execution_time + return result + + except asyncio.TimeoutError: + execution_time = time.time() - start_time + error_msg = f"Execution timed out after {self.timeout} seconds" + logger.error(error_msg) + + self.usage_tracker.update( + success=False, + token_usage={}, + turn_count=0, + execution_time=execution_time + ) + + return { + "success": False, + "output": [], + "token_usage": {}, + "turn_count": 0, + "execution_time": execution_time, + "error": error_msg + } + + except Exception as e: + execution_time = time.time() - start_time + error_msg = f"Agent execution failed: {e}" + logger.error(error_msg, exc_info=True) + + self.usage_tracker.update( + success=False, + token_usage={}, + turn_count=0, + execution_time=execution_time + ) + + return { + "success": False, + "output": [], + "token_usage": {}, + "turn_count": 0, + "execution_time": execution_time, + "error": str(e) + } + + async def _execute_anthropic_native( + self, + instruction: str, + tool_call_log_file: Optional[str] = None + ) -> Dict[str, Any]: + """ + Execute using Anthropic's native MCP support for HTTP-based services. + Only used for Claude models with GitHub MCP service. 
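Each entry of `extra_body["mcp_servers"]` follows the shape that `_get_anthropic_mcp_config()` returns further down; for the GitHub service it looks like this (token value illustrative):

```python
extra_body = {
    "mcp_servers": [
        {
            "type": "url",
            "url": "https://api.githubcopilot.com/mcp/",
            "name": "github",
            "authorization_token": "<github_token from service_config>",
        }
    ]
}
```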
+ """ + logger.debug("Using Anthropic native MCP execution for HTTP service") + + # Get MCP configuration for Anthropic + mcp_config = self._get_anthropic_mcp_config() + + # Prepare messages + messages = [{"role": "user", "content": instruction}] + + # Prepare extra headers and body for Anthropic + extra_headers = { + "anthropic-version": "2023-06-01", + "anthropic-beta": "mcp-client-2025-04-04", + } + + extra_body = { + "mcp_servers": [mcp_config], + } + + try: + return await self._get_anthropic_response( + messages, extra_headers, extra_body, tool_call_log_file + ) + except Exception as e: + # Fallback: return minimal failure result with user message captured + logger.error(f"Anthropic native path failed: {e}", exc_info=True) + sdk_format_messages = self._convert_to_sdk_format(messages) + return { + "success": False, + "output": sdk_format_messages, + "token_usage": {}, + "turn_count": 0, + "error": str(e), + "litellm_run_model_name": self.litellm_run_model_name, + } + + async def _get_anthropic_response( + self, + messages: List[Dict], + extra_headers: Dict, + extra_body: Dict, + tool_call_log_file: Optional[str] = None + ) -> Dict[str, Any]: + """Get non-streaming response from Anthropic.""" + try: + # Build completion kwargs + completion_kwargs = { + "model": self.litellm_input_model_name, + "messages": messages, + "api_key": self.api_key, + "extra_headers": extra_headers, + "extra_body": extra_body, + } + + # Add reasoning_effort if specified + if self.reasoning_effort != "default": + completion_kwargs["reasoning_effort"] = self.reasoning_effort + + response = await litellm.acompletion(**completion_kwargs) + + # Extract actual model name from response + if hasattr(response, 'model') and response.model: + self.litellm_run_model_name = response.model + + # Extract response content + content = response.choices[0].message.content if response.choices else "" + + # Extract token usage including reasoning tokens + token_usage = {} + if hasattr(response, 'usage') and response.usage: + token_usage = { + "input_tokens": response.usage.prompt_tokens or 0, + "output_tokens": response.usage.completion_tokens or 0, + "total_tokens": response.usage.total_tokens or 0 + } + + # Extract reasoning tokens if available + if hasattr(response.usage, 'completion_tokens_details'): + details = response.usage.completion_tokens_details + if hasattr(details, 'reasoning_tokens'): + token_usage["reasoning_tokens"] = details.reasoning_tokens or 0 + + # Log to file if specified + if tool_call_log_file and content: + with open(tool_call_log_file, 'a', encoding='utf-8') as f: + f.write(content + "\n") + + # Display token usage + if token_usage: + log_msg = ( + f"\n| Token usage: Total: {token_usage['total_tokens']:,} | " + f"Input: {token_usage['input_tokens']:,} | " + f"Output: {token_usage['output_tokens']:,}" + ) + if "reasoning_tokens" in token_usage: + log_msg += f" | Reasoning: {token_usage['reasoning_tokens']:,}" + logger.info(log_msg) + + # Convert to SDK format for backward compatibility + messages_with_response = messages + [{"role": "assistant", "content": content}] + sdk_format_messages = self._convert_to_sdk_format(messages_with_response) + + return { + "success": True, + "output": sdk_format_messages, + "token_usage": token_usage, + "turn_count": 1, + "error": None, + "litellm_run_model_name": self.litellm_run_model_name + } + + except Exception as e: + logger.error(f"Anthropic execution failed: {e}") + raise + + async def _execute_with_manual_mcp( + self, + instruction: str, + tool_call_log_file: 
Optional[str] = None + ) -> Dict[str, Any]: + """ + Execute with manual MCP server management. + Used for all non-Anthropic models and Anthropic models with STDIO services. + """ + logger.debug("Using manual MCP execution with function calling loop") + + # Create and start MCP server + mcp_server = await self._create_mcp_server() + + try: + async with mcp_server: + # Get available tools + tools = await mcp_server.list_tools() + + # Convert MCP tools to OpenAI function format + functions = self._convert_mcp_tools_to_functions(tools) + + # Execute with function calling loop + return await self._execute_function_calling_loop( + instruction, functions, mcp_server, tool_call_log_file + ) + + except Exception as e: + logger.error(f"Manual MCP execution failed: {e}") + raise + + def _convert_to_sdk_format(self, messages: List[Dict]) -> List[Dict]: + """Convert OpenAI messages format to old SDK format for backward compatibility.""" + sdk_format = [] + function_call_map = {} # Track function names to call IDs for legacy format + + for msg in messages: + role = msg.get("role") + + if role == "user": + # User messages stay mostly the same + sdk_format.append({ + "content": msg.get("content", ""), + "role": "user" + }) + + elif role == "assistant": + # === CHANGED ORDER START === + tool_calls = msg.get("tool_calls", []) + function_call = msg.get("function_call") + content = msg.get("content") + + # 1) First add assistant's text content (if present) + if content: + sdk_format.append({ + "id": "__fake_id__", + "content": [ + { + "annotations": [], + "text": content if content else "", + "type": "output_text" + } + ], + "role": "assistant", + "status": "completed", + "type": "message" + }) + + # 2) Then add (new format) tool_calls + if tool_calls: + for tool_call in tool_calls: + call_id = tool_call.get("id", f"call_{uuid.uuid4().hex}") + func_name = tool_call.get("function", {}).get("name", "") + sdk_format.append({ + "arguments": tool_call.get("function", {}).get("arguments", "{}"), + "call_id": call_id, + "name": func_name, + "type": "function_call", + "id": "__fake_id__" + }) + + # 3) Finally handle (legacy format) function_call + if function_call: + func_name = function_call.get("name", "") + call_id = f"call_{uuid.uuid4().hex}" + function_call_map[func_name] = call_id # Store for matching responses + sdk_format.append({ + "arguments": function_call.get("arguments", "{}"), + "call_id": call_id, + "name": func_name, + "type": "function_call", + "id": "__fake_id__" + }) + + # 4) If neither content nor any calls exist, maintain fallback behavior + if not content and not tool_calls and not function_call: + sdk_format.append({ + "id": "__fake_id__", + "content": [ + { + "annotations": [], + "text": "", + "type": "output_text" + } + ], + "role": "assistant", + "status": "completed", + "type": "message" + }) + # === CHANGED ORDER END === + + elif role == "tool": + # Tool responses + sdk_format.append({ + "call_id": msg.get("tool_call_id", ""), + "output": json.dumps({ + "type": "text", + "text": msg.get("content", ""), + "annotations": None, + "meta": None + }), + "type": "function_call_output" + }) + + elif role == "function": + # Legacy function responses - try to match with stored call ID + func_name = msg.get("name", "") + call_id = function_call_map.get(func_name, f"call_{uuid.uuid4().hex}") + sdk_format.append({ + "call_id": call_id, + "output": json.dumps({ + "type": "text", + "text": msg.get("content", ""), + "annotations": None, + "meta": None + }), + "type": "function_call_output" + }) + + return 
sdk_format + + + async def _execute_function_calling_loop( + self, + instruction: str, + functions: List[Dict], + mcp_server: Any, + tool_call_log_file: Optional[str] = None + ) -> Dict[str, Any]: + """Execute function calling loop with LiteLLM.""" + messages = [ + {"role": "system", "content": "You are a helpful agent that uses tools iteratively to complete the user's task, and when finished, provides the final answer or simply states \"Task completed\" without further tool calls."}, + {"role": "user", "content": instruction} + ] + total_tokens = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0, "reasoning_tokens": 0} + turn_count = 0 + max_turns = self.MAX_TURNS # Limit turns to prevent infinite loops + consecutive_failures = 0 + max_consecutive_failures = 3 + + # Convert functions to tools format for newer models + tools = [{"type": "function", "function": func} for func in functions] if functions else None + + try: + while turn_count < max_turns: + + # Build completion kwargs + completion_kwargs = { + "model": self.litellm_input_model_name, + "messages": messages, + "api_key": self.api_key, + } + + # Always use tools format if available - LiteLLM will handle conversion + if tools: + completion_kwargs["tools"] = tools + completion_kwargs["tool_choice"] = "auto" + + # Add reasoning_effort and base_url if specified + if self.reasoning_effort != "default": + completion_kwargs["reasoning_effort"] = self.reasoning_effort + if self.base_url: + completion_kwargs["base_url"] = self.base_url + + try: + # Call LiteLLM with timeout for individual call + response = await asyncio.wait_for( + litellm.acompletion(**completion_kwargs), + timeout = self.timeout / 2 # Use half of total timeout + ) + consecutive_failures = 0 # Reset failure counter on success + except asyncio.TimeoutError: + logger.warning(f"| ✗ LLM call timed out on turn {turn_count + 1}") + consecutive_failures += 1 + if consecutive_failures >= max_consecutive_failures: + raise Exception(f"Too many consecutive failures ({consecutive_failures})") + await asyncio.sleep(8 ** consecutive_failures) # Exponential backoff + continue + except Exception as e: + logger.error(f"| ✗ LLM call failed on turn {turn_count + 1}: {e}") + consecutive_failures += 1 + if consecutive_failures >= max_consecutive_failures: + raise + await asyncio.sleep(8 ** consecutive_failures) # Exponential backoff + continue + + # Extract actual model name from response (first turn only) + if turn_count == 0 and hasattr(response, 'model') and response.model: + self.litellm_run_model_name = response.model.split("/")[-1] + + # Update token usage including reasoning tokens + if hasattr(response, 'usage') and response.usage: + total_tokens["input_tokens"] += response.usage.prompt_tokens or 0 + total_tokens["output_tokens"] += response.usage.completion_tokens or 0 + total_tokens["total_tokens"] += response.usage.total_tokens or 0 + + # Extract reasoning tokens if available + if hasattr(response.usage, 'completion_tokens_details'): + details = response.usage.completion_tokens_details + if hasattr(details, 'reasoning_tokens'): + total_tokens["reasoning_tokens"] += details.reasoning_tokens or 0 + + # Get response message + choices = response.choices + if len(choices): + message = choices[0].message + else: + break + + # Convert to dict (prefer model_dump over deprecated dict()) + message_dict = message.model_dump() if hasattr(message, 'model_dump') else dict(message) + + # Log assistant's text content if present + if hasattr(message, 'content') and message.content: + # 
Display the content with line prefix + for line in message.content.splitlines(): + logger.info(f"| {line}") + + # Also log to file if specified + if tool_call_log_file: + with open(tool_call_log_file, 'a', encoding='utf-8') as f: + f.write(f"{message.content}\n") + + # Check for tool calls (newer format) + if hasattr(message, 'tool_calls') and message.tool_calls: + messages.append(message_dict) + turn_count += 1 + # Process tool calls + for tool_call in message.tool_calls: + func_name = tool_call.function.name + func_args = json.loads(tool_call.function.arguments) + + try: + result = await asyncio.wait_for( + mcp_server.call_tool(func_name, func_args), + timeout=30 + ) + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": json.dumps(result) + }) + except asyncio.TimeoutError: + error_msg = f"Tool call '{func_name}' timed out after 30 seconds" + logger.error(error_msg) + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": f"Error: {error_msg}" + }) + except Exception as e: + logger.error(f"Tool call failed: {e}") + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": f"Error: {str(e)}" + }) + + # Format arguments for display (truncate if too long) + args_str = json.dumps(func_args, separators=(",", ": ")) + display_arguments = args_str[:140] + "..." if len(args_str) > 140 else args_str + + # Log with ANSI color codes (bold tool name, dim gray arguments) + logger.info(f"| \033[1m{func_name}\033[0m \033[2;37m{display_arguments}\033[0m") + + if tool_call_log_file: + with open(tool_call_log_file, 'a', encoding='utf-8') as f: + f.write(f"| {func_name} {args_str}\n") + continue + else: + # No tool/function call, add message and we're done + messages.append(message_dict) + turn_count += 1 + break + except Exception as loop_error: + # On any error, return partial conversation, token usage, and turn count + logger.error(f"Manual MCP loop failed: {loop_error}", exc_info=True) + sdk_format_messages = self._convert_to_sdk_format(messages) + return { + "success": False, + "output": sdk_format_messages, + "token_usage": total_tokens, + "turn_count": turn_count, + "error": str(loop_error), + "litellm_run_model_name": self.litellm_run_model_name, + } + + # Display final token usage + if total_tokens["total_tokens"] > 0: + log_msg = ( + f"|\n| Token usage: Total: {total_tokens['total_tokens']:,} | " + f"Input: {total_tokens['input_tokens']:,} | " + f"Output: {total_tokens['output_tokens']:,}" + ) + if total_tokens.get("reasoning_tokens", 0) > 0: + log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}" + logger.info(log_msg) + logger.info(f"| Turns: {turn_count}") + + # Convert messages to SDK format for backward compatibility + sdk_format_messages = self._convert_to_sdk_format(messages) + + return { + "success": True, + "output": sdk_format_messages, + "token_usage": total_tokens, + "turn_count": turn_count, + "error": None, + "litellm_run_model_name": self.litellm_run_model_name + } + + def _convert_mcp_tools_to_functions(self, tools: List[Dict]) -> List[Dict]: + """Convert MCP tool definitions to OpenAI function format.""" + functions = [] + + for tool in tools: + function = { + "name": tool.get("name"), + "description": tool.get("description", ""), + "parameters": tool.get("inputSchema", { + "type": "object", + "properties": {}, + "required": [] + }) + } + functions.append(function) + + return functions + + + def _get_anthropic_mcp_config(self) -> Dict[str, Any]: + """Get MCP configuration for Anthropic native 
support.""" + if self.mcp_service == "github": + return { + "type": "url", + "url": "https://api.githubcopilot.com/mcp/", + "name": "github", + "authorization_token": self.service_config.get("github_token", "") + } + else: + # For stdio-based services, Anthropic expects a different format + # This would need to be implemented based on Anthropic's requirements + raise NotImplementedError( + f"Anthropic native MCP for {self.mcp_service} not yet implemented" + ) + + async def _create_mcp_server(self) -> Any: + """Create and return an MCP server instance.""" + if self.mcp_service in self.STDIO_SERVICES: + return self._create_stdio_server() + elif self.mcp_service in self.HTTP_SERVICES: + return self._create_http_server() + else: + raise ValueError(f"Unsupported MCP service: {self.mcp_service}") + + def _create_stdio_server(self) -> MCPStdioServer: + """Create stdio-based MCP server.""" + if self.mcp_service == "notion": + notion_key = self.service_config.get("notion_key") + if not notion_key: + raise ValueError("Notion API key required") + + return MCPStdioServer( + command="npx", + args=["-y", "@notionhq/notion-mcp-server"], + env={ + "OPENAPI_MCP_HEADERS": ( + '{"Authorization": "Bearer ' + notion_key + '", ' + '"Notion-Version": "2022-06-28"}' + ) + } + ) + + elif self.mcp_service == "filesystem": + test_directory = self.service_config.get("test_directory") + if not test_directory: + raise ValueError("Test directory required for filesystem service") + + return MCPStdioServer( + command="npx", + args=["-y", "@modelcontextprotocol/server-filesystem", str(test_directory)] + ) + + elif self.mcp_service in ["playwright", "playwright_webarena"]: + browser = self.service_config.get("browser", "chromium") + headless = self.service_config.get("headless", True) + viewport_width = self.service_config.get("viewport_width", 1280) + viewport_height = self.service_config.get("viewport_height", 720) + + args = ["-y", "@playwright/mcp@latest"] + if headless: + args.append("--headless") + args.extend([ + "--isolated", + "--no-sandbox", + "--browser", browser, + "--viewport-size", f"{viewport_width},{viewport_height}" + ]) + + return MCPStdioServer(command="npx", args=args) + + elif self.mcp_service == "postgres": + host = self.service_config.get("host", "localhost") + port = self.service_config.get("port", 5432) + username = self.service_config.get("username") + password = self.service_config.get("password") + database = self.service_config.get("current_database") or self.service_config.get("database") + + if not all([username, password, database]): + raise ValueError("PostgreSQL requires username, password, and database") + + database_url = f"postgresql://{username}:{password}@{host}:{port}/{database}" + + return MCPStdioServer( + command="pipx", + args=["run", "postgres-mcp", "--access-mode=unrestricted"], + env={"DATABASE_URI": database_url} + ) + + else: + raise ValueError(f"Unsupported stdio service: {self.mcp_service}") + + def _create_http_server(self) -> MCPHttpServer: + """Create HTTP-based MCP server.""" + if self.mcp_service == "github": + github_token = self.service_config.get("github_token") + if not github_token: + raise ValueError("GitHub token required") + + return MCPHttpServer( + url="https://api.githubcopilot.com/mcp/", + headers={ + "Authorization": f"Bearer {github_token}", + "User-Agent": "MCPMark/1.0" + } + ) + else: + raise ValueError(f"Unsupported HTTP service: {self.mcp_service}") + + def execute_sync( + self, + instruction: str, + tool_call_log_file: Optional[str] = None + ) -> 
Dict[str, Any]: + """ + Synchronous wrapper for execute method. + """ + try: + return asyncio.run(self.execute(instruction, tool_call_log_file)) + except asyncio.TimeoutError: + self.usage_tracker.update(False, {}, 0, self.timeout) + return { + "success": False, + "output": [], + "token_usage": {}, + "turn_count": 0, + "execution_time": self.timeout, + "error": f"Execution timed out after {self.timeout} seconds" + } + + def get_usage_stats(self) -> Dict[str, Any]: + """Get usage statistics.""" + return self.usage_tracker.get_stats() + + def reset_usage_stats(self): + """Reset usage statistics.""" + self.usage_tracker.reset() + + def __repr__(self): + return ( + f"MCPMarkAgent(service='{self.mcp_service}', model='{self.litellm_input_model_name}', " + ) \ No newline at end of file diff --git a/src/agents/utils/__init__.py b/src/agents/utils/__init__.py new file mode 100644 index 0000000..f5b2242 --- /dev/null +++ b/src/agents/utils/__init__.py @@ -0,0 +1,8 @@ +""" +Utility functions for MCPMark Agent +==================================== +""" + +from .token_usage import TokenUsageTracker + +__all__ = ["TokenUsageTracker"] \ No newline at end of file diff --git a/src/agents/utils/token_usage.py b/src/agents/utils/token_usage.py new file mode 100644 index 0000000..17ac0f4 --- /dev/null +++ b/src/agents/utils/token_usage.py @@ -0,0 +1,78 @@ +""" +Token Usage Tracking Utilities +=============================== +""" + +from typing import Dict, Any + + +class TokenUsageTracker: + """Track token usage across agent executions.""" + + def __init__(self): + """Initialize token usage tracker.""" + self.reset() + + def reset(self): + """Reset all usage statistics.""" + self._stats = { + "total_input_tokens": 0, + "total_output_tokens": 0, + "total_tokens": 0, + "total_turns": 0, + "total_execution_time": 0.0, + "successful_executions": 0, + "failed_executions": 0, + } + + def update(self, success: bool, token_usage: Dict[str, int], + turn_count: int, execution_time: float): + """ + Update usage statistics. + + Args: + success: Whether execution was successful + token_usage: Token usage dict with input_tokens, output_tokens, total_tokens + turn_count: Number of conversation turns + execution_time: Execution time in seconds + """ + if success: + self._stats["successful_executions"] += 1 + else: + self._stats["failed_executions"] += 1 + + self._stats["total_input_tokens"] += token_usage.get("input_tokens", 0) + self._stats["total_output_tokens"] += token_usage.get("output_tokens", 0) + self._stats["total_tokens"] += token_usage.get("total_tokens", 0) + self._stats["total_turns"] += turn_count + self._stats["total_execution_time"] += execution_time + + def get_stats(self) -> Dict[str, Any]: + """ + Get usage statistics with calculated averages. 
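A quick sketch of the tracker on its own (the numbers are made up):

```python
from src.agents.utils import TokenUsageTracker

tracker = TokenUsageTracker()
tracker.update(
    success=True,
    token_usage={"input_tokens": 1200, "output_tokens": 300, "total_tokens": 1500},
    turn_count=4,
    execution_time=12.5,
)
tracker.update(success=False, token_usage={}, turn_count=0, execution_time=3.0)

stats = tracker.get_stats()
print(stats["success_rate"])      # 50.0 (percentage over all executions)
print(stats["avg_total_tokens"])  # 750.0
print(stats["avg_turns"])         # 2.0
```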
+ + Returns: + Dictionary containing usage statistics + """ + stats = self._stats.copy() + + # Calculate averages + total_executions = stats["successful_executions"] + stats["failed_executions"] + if total_executions > 0: + stats["avg_input_tokens"] = stats["total_input_tokens"] / total_executions + stats["avg_output_tokens"] = stats["total_output_tokens"] / total_executions + stats["avg_total_tokens"] = stats["total_tokens"] / total_executions + stats["avg_turns"] = stats["total_turns"] / total_executions + stats["avg_execution_time"] = stats["total_execution_time"] / total_executions + stats["success_rate"] = (stats["successful_executions"] / total_executions * 100) + else: + stats.update({ + "avg_input_tokens": 0.0, + "avg_output_tokens": 0.0, + "avg_total_tokens": 0.0, + "avg_turns": 0.0, + "avg_execution_time": 0.0, + "success_rate": 0.0, + }) + + return stats \ No newline at end of file diff --git a/src/aggregators/aggregate_results.py b/src/aggregators/aggregate_results.py index c2e8344..1852404 100644 --- a/src/aggregators/aggregate_results.py +++ b/src/aggregators/aggregate_results.py @@ -127,7 +127,7 @@ def collect_task_results_from_run( "task_execution_time": meta.get("task_execution_time", 0), "token_usage": meta.get("token_usage", {}), "turn_count": meta.get("turn_count", 0), - "actual_model_name": meta.get("actual_model_name"), + "model_name_name": meta.get("model_name_name"), "meta": meta, # Keep full meta for model_results } except Exception as e: @@ -164,7 +164,7 @@ def calculate_k_run_metrics( "total_output_tokens": 0, "total_tokens": 0, "total_turns": 0, - "actual_model_name": None, + "model_name_name": None, } ) @@ -240,14 +240,14 @@ def calculate_k_run_metrics( service_model_metrics[service_model]["total_tokens"] += sum(task_total_tokens) service_model_metrics[service_model]["total_turns"] += sum(task_turns) - # Store actual_model_name from first available task result - if service_model_metrics[service_model]["actual_model_name"] is None: + # Store model_name_name from first available task result + if service_model_metrics[service_model]["model_name_name"] is None: for run_name in runs_to_process: if run_name in all_runs_results: run_results = all_runs_results[run_name] task_result = run_results.get(task_key) - if task_result and task_result.get("actual_model_name"): - service_model_metrics[service_model]["actual_model_name"] = task_result["actual_model_name"] + if task_result and task_result.get("model_name_name"): + service_model_metrics[service_model]["model_name_name"] = task_result["model_name_name"] break # pass@1: Will be calculated as avg@k later (skip individual task counting) @@ -372,7 +372,7 @@ def aggregate_single_run_results( total_output_tokens = 0 total_tokens = 0 total_turns = 0 - actual_model_name = None + model_name_name = None for task_dir in service_model_dir.iterdir(): if not task_dir.is_dir(): @@ -403,9 +403,9 @@ def aggregate_single_run_results( total_tokens += token_usage.get("total_tokens", 0) or 0 total_turns += meta.get("turn_count", 0) or 0 - # Store actual_model_name from first available meta - if actual_model_name is None and meta.get("actual_model_name"): - actual_model_name = meta.get("actual_model_name") + # Store model_name_name from first available meta + if model_name_name is None and meta.get("model_name_name"): + model_name_name = meta.get("model_name_name") except Exception as e: print(f"⚠️ Error reading {meta_path}: {e}") @@ -428,7 +428,7 @@ def aggregate_single_run_results( "avg_output_tokens": round(total_output_tokens / 
total_tasks, 4), "avg_total_tokens": round(total_tokens / total_tasks, 4), "avg_turns": round(total_turns / total_tasks, 4), - "actual_model_name": actual_model_name, + "model_name_name": model_name_name, } return service_model_results @@ -529,11 +529,11 @@ def create_simplified_summary( model_metrics["per_run_output_tokens"] = per_run_output_tokens model_metrics["per_run_cost"] = per_run_cost - # Set actual_model_name from first available service data + # Set model_name_name from first available service data for service_model, metrics in service_model_results.items(): if "__" in service_model and service_model.split("__", 1)[1] == model: - if metrics.get("actual_model_name"): - model_metrics["actual_model_name"] = metrics["actual_model_name"] + if metrics.get("model_name_name"): + model_metrics["model_name_name"] = metrics["model_name_name"] break if k > 1: @@ -697,8 +697,8 @@ def create_simplified_summary( "per_run_cost": per_run_cost, } - # Add actual_model_name to service-level metrics - formatted_metrics["actual_model_name"] = metrics.get("actual_model_name") + # Add model_name_name to service-level metrics + formatted_metrics["model_name_name"] = metrics.get("model_name_name") summary[service][model] = formatted_metrics return summary diff --git a/src/errors.py b/src/errors.py index ca2742a..36eb8c3 100644 --- a/src/errors.py +++ b/src/errors.py @@ -9,23 +9,26 @@ from typing import Optional -# Retryable error patterns +"""Retryable error detection via minimal substring matching (lower-case).""" + +# Keep this list short and generic; aim to catch API/infrastructure issues only. RETRYABLE_PATTERNS = { - "timeout", - "timed out", - "etimedout", - "econnrefused", - "connection refused", - "network error", + "ratelimit", # e.g., RateLimitError, too many requests + "timeout", # any timeout wording + "connection", # connection refused/reset/error + "unavailable", # service unavailable + "internal server error", # 500s + "network error", # generic network issue + "quota", # budget/quota exceeded + # pipeline infra signals "mcp network error", "state duplication error", - "already exists", } def is_retryable_error(error: str) -> bool: - """Check if an error message indicates it should be retried.""" - error_lower = str(error).lower() + """Return True if the error string contains any retryable pattern.""" + error_lower = str(error or "").lower() return any(pattern in error_lower for pattern in RETRYABLE_PATTERNS) @@ -57,8 +60,3 @@ def standardize_error_message(error: str, mcp_service: Optional[str] = None) -> return f"{mcp_service.title()} {base_msg}" return base_msg - - -def get_retry_delay(attempt: int, base_delay: int = 5) -> int: - """Get exponential backoff delay for retries.""" - return min(base_delay * (2 ** (attempt - 1)), 60) # Cap at 60 seconds diff --git a/src/evaluator.py b/src/evaluator.py index 819ad7f..aed77a2 100644 --- a/src/evaluator.py +++ b/src/evaluator.py @@ -10,12 +10,8 @@ from src.factory import MCPServiceFactory from src.model_config import ModelConfig from src.results_reporter import EvaluationReport, ResultsReporter, TaskResult -from src.agent import MCPAgent - -PIPELINE_RETRY_ERRORS: List[str] = [ - "State Duplication Error", - "MCP Network Error", -] +from src.errors import is_retryable_error +from src.agents import MCPMarkAgent # Initialize logger logger = get_logger(__name__) @@ -29,18 +25,23 @@ def __init__( timeout: int = 300, exp_name: str = "test-run", output_dir: Path = None, - stream: bool = False, + reasoning_effort: str = "default", ): # Main 
configuration self.mcp_service = mcp_service - self.model = model self.timeout = timeout - + # Initialize model configuration - model_config = ModelConfig(model) - self.actual_model_name = model_config.actual_model_name - self.base_url = model_config.base_url + self.reasoning_effort = reasoning_effort + self.model_name = model + + model_config = ModelConfig(self.model_name) self.api_key = model_config.api_key + self.base_url = model_config.base_url + self.litellm_input_model_name = model_config.litellm_input_model_name + + # Track the actual model name from LiteLLM responses + self.litellm_run_model_name = None # Initialize managers using the factory pattern (simplified) self.task_manager = MCPServiceFactory.create_task_manager(mcp_service) @@ -53,22 +54,25 @@ def __init__( # automatically refresh its service configuration from the state # manager before each execution, so per-task manual updates are no # longer needed. - self.agent = MCPAgent( - model_name=self.actual_model_name, + self.agent = MCPMarkAgent( + litellm_input_model_name=self.litellm_input_model_name, # Use the original model name for detection api_key=self.api_key, base_url=self.base_url, mcp_service=mcp_service, timeout=timeout, service_config=self.service_config, service_config_provider=self.state_manager.get_service_config_for_agent, - stream=stream, + reasoning_effort=self.reasoning_effort, ) # Initialize results reporter self.results_reporter = ResultsReporter() # Output directory handling - model_slug = self.model.replace(".", "-") + if self.reasoning_effort != "default": + model_slug = self.model_name.replace(".", "-") + "-" + self.reasoning_effort + else: + model_slug = self.model_name.replace(".", "-") self.base_experiment_dir = output_dir / exp_name / f"{mcp_service}__{model_slug}" self.base_experiment_dir.mkdir(parents=True, exist_ok=True) @@ -216,6 +220,10 @@ def _run_single_task(self, task) -> TaskResult: ) agent_execution_time = time.time() - agent_execution_start_time + + # Extract actual model name from LiteLLM response + if agent_result.get("litellm_run_model_name"): + self.litellm_run_model_name = agent_result["litellm_run_model_name"] # Write messages.json to task_output_dir messages_path = task_output_dir / "messages.json" @@ -284,7 +292,7 @@ def run_evaluation(self, task_filter: str) -> EvaluationReport: retry_due_to_error = ( existing_result is not None and not existing_result.success - and existing_result.error_message in PIPELINE_RETRY_ERRORS + and is_retryable_error(existing_result.error_message) ) if existing_result and not retry_due_to_error: @@ -334,7 +342,9 @@ def run_evaluation(self, task_filter: str) -> EvaluationReport: meta_path = task_output_dir / "meta.json" model_config = { "mcp_service": self.mcp_service, - "model_name": self.actual_model_name, + "model_name": self.model_name, + "litellm_run_model_name": self.litellm_run_model_name, + "reasoning_effort": self.reasoning_effort, "timeout": self.timeout, } self.results_reporter.save_meta_json( @@ -375,10 +385,12 @@ def _matches_filter(tr: TaskResult, flt: str) -> bool: final_results = list(merged.values()) aggregated_report = EvaluationReport( - model_name=self.model, + model_name=self.model_name, model_config={ "mcp_service": self.mcp_service, - "model_name": self.actual_model_name, + "model_name": self.model_name, + "litellm_run_model_name": self.litellm_run_model_name, + "reasoning_effort": self.reasoning_effort, "timeout": self.timeout, }, total_tasks=len(final_results), diff --git a/src/model_config.py b/src/model_config.py index 
b51b378..ff2f9b0 100644 --- a/src/model_config.py +++ b/src/model_config.py @@ -28,123 +28,130 @@ class ModelConfig: "gpt-4o": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-4o", + "litellm_input_model_name": "openai/gpt-4o", }, "gpt-4.1": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-4.1", + "litellm_input_model_name": "openai/gpt-4.1", }, "gpt-4.1-mini": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-4.1-mini", + "litellm_input_model_name": "openai/gpt-4.1-mini", }, "gpt-5": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-5", + "litellm_input_model_name": "openai/gpt-5", }, "gpt-5-mini": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-5-mini", + "litellm_input_model_name": "openai/gpt-5-mini", }, "gpt-5-nano": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-5-nano", + "litellm_input_model_name": "openai/gpt-5-nano", }, "o3": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "o3", + "litellm_input_model_name": "openai/o3", }, "o4-mini": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "o4-mini", + "litellm_input_model_name": "openai/o4-mini", + }, + "gpt-oss-120b": { + "provider": "openai", + "api_key_var": "OPENROUTER_API_KEY", + "litellm_input_model_name": "openrouter/openai/gpt-oss-120b", }, # DeepSeek models "deepseek-chat": { "provider": "deepseek", "api_key_var": "DEEPSEEK_API_KEY", - "base_url_var": "DEEPSEEK_BASE_URL", - "actual_model_name": "deepseek-chat", + "litellm_input_model_name": "deepseek/deepseek-chat", }, "deepseek-reasoner": { "provider": "deepseek", "api_key_var": "DEEPSEEK_API_KEY", - "base_url_var": "DEEPSEEK_BASE_URL", - "actual_model_name": "deepseek-reasoner", + "litellm_input_model_name": "deepseek/deepseek-reasoner", }, # Anthropic models - "claude-3-7-sonnet": { + "claude-3.7-sonnet": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", - "base_url_var": "ANTHROPIC_BASE_URL", - "actual_model_name": "claude-3-7-sonnet-20250219", + "litellm_input_model_name": "anthropic/claude-3-7-sonnet-20250219", }, - "claude-4-sonnet": { + "claude-sonnet-4": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", - "base_url_var": "ANTHROPIC_BASE_URL", - "actual_model_name": "claude-sonnet-4-20250514", + "litellm_input_model_name": "anthropic/claude-sonnet-4-20250514", }, - "claude-4-opus": { + "claude-opus-4": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", - "base_url_var": "ANTHROPIC_BASE_URL", - "actual_model_name": "claude-opus-4-20250514", + "litellm_input_model_name": "anthropic/claude-opus-4-20250514", }, - "claude-4.1-opus": { + "claude-opus-4.1": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", - "base_url_var": "ANTHROPIC_BASE_URL", - "actual_model_name": "claude-opus-4-1-20250805", + "litellm_input_model_name": "anthropic/claude-opus-4-1-20250805", }, # Google models "gemini-2.5-pro": { "provider": "google", "api_key_var": "GEMINI_API_KEY", - "base_url_var": "GEMINI_BASE_URL", - "actual_model_name": "gemini-2.5-pro", + "litellm_input_model_name": 
"gemini/gemini-2.5-pro", }, "gemini-2.5-flash": { "provider": "google", "api_key_var": "GEMINI_API_KEY", - "base_url_var": "GEMINI_BASE_URL", - "actual_model_name": "gemini-2.5-flash", + "litellm_input_model_name": "gemini/gemini-2.5-flash", }, # Moonshot models "k2": { "provider": "moonshot", "api_key_var": "MOONSHOT_API_KEY", - "base_url_var": "MOONSHOT_BASE_URL", - "actual_model_name": "kimi-k2-0711-preview", + "litellm_input_model_name": "moonshot/kimi-k2-0711-preview", + }, + "k2-turbo": { + "provider": "moonshot", + "api_key_var": "MOONSHOT_API_KEY", + "litellm_input_model_name": "moonshot/kimi-k2-turbo-preview", }, # Grok models "grok-4": { "provider": "xai", "api_key_var": "GROK_API_KEY", - "base_url_var": "GROK_BASE_URL", - "actual_model_name": "grok-4-0709", + "litellm_input_model_name": "xai/grok-4-0709", + }, + "grok-code-fast-1": { + "provider": "xai", + "api_key_var": "GROK_API_KEY", + "litellm_input_model_name": "xai/grok-code-fast-1", }, # Qwen models "qwen-3-coder": { "provider": "qwen", - "api_key_var": "QWEN_API_KEY", - "base_url_var": "QWEN_BASE_URL", - "actual_model_name": "qwen/qwen3-coder", + "api_key_var": "OPENROUTER_API_KEY", + "litellm_input_model_name": "openrouter/qwen/qwen3-coder", }, + "qwen-3-coder-plus": { + "provider": "qwen", + "api_key_var": "DASHSCOPE_API_KEY", + "litellm_input_model_name": "dashscope/qwen3-coder-plus", + }, + # Zhipu + "glm-4.5": { + "provider": "zhipu", + "api_key_var": "OPENROUTER_API_KEY", + "litellm_input_model_name": "openrouter/z-ai/glm-4.5", + } } def __init__(self, model_name: str): @@ -157,25 +164,22 @@ def __init__(self, model_name: str): Raises: ValueError: If the model is not supported or environment variables are missing. """ - self.model_name = model_name + self.short_model_name = model_name model_info = self._get_model_info(model_name) - # Load API key and base URL from environment variables + # Load API key, base URL and LiteLLM model name from environment variables + if "base_url_var" in model_info: + self.base_url = os.getenv(model_info["base_url_var"]) + else: + self.base_url = None + self.api_key = os.getenv(model_info["api_key_var"]) if not self.api_key: raise ValueError( f"Missing required environment variable: {model_info['api_key_var']}" ) - self.base_url = os.getenv(model_info["base_url_var"]) - if not self.base_url: - raise ValueError( - f"Missing required environment variable: {model_info['base_url_var']}" - ) - - # Store provider and the actual model name for the API - self.provider = model_info["provider"] - self.actual_model_name = model_info.get("actual_model_name", model_name) + self.litellm_input_model_name = model_info.get("litellm_input_model_name", model_name) def _get_model_info(self, model_name: str) -> Dict[str, str]: """ @@ -190,8 +194,7 @@ def _get_model_info(self, model_name: str) -> Dict[str, str]: return { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": model_name, + "litellm_input_model_name": model_name, } return self.MODEL_CONFIGS[model_name] @@ -209,10 +212,8 @@ def main(): # Example: Create a model config for DeepSeek model_config = ModelConfig("deepseek-chat") logger.info("✅ DeepSeek model config created successfully.") - logger.info("Provider: %s", model_config.provider) - logger.info("Actual model name: %s", model_config.actual_model_name) + logger.info("Short model name: %s", model_config.short_model_name) logger.info("API key loaded: %s", bool(model_config.api_key)) - logger.info("Base URL: %s", 
model_config.base_url) except ValueError as e: logger.error("⚠️ Configuration error: %s", e) diff --git a/src/results_reporter.py b/src/results_reporter.py index 40ec5da..6c54136 100644 --- a/src/results_reporter.py +++ b/src/results_reporter.py @@ -102,6 +102,15 @@ def total_tokens(self) -> int: if result.token_usage: total += result.token_usage.get("total_tokens", 0) return total + + @property + def total_reasoning_tokens(self) -> int: + """Calculate total reasoning tokens across all tasks.""" + total = 0 + for result in self.task_results: + if result.token_usage: + total += result.token_usage.get("reasoning_tokens", 0) + return total @property def avg_input_tokens(self) -> float: @@ -123,6 +132,13 @@ def avg_total_tokens(self) -> float: if self.total_tasks == 0: return 0.0 return self.total_tokens / self.total_tasks + + @property + def avg_reasoning_tokens(self) -> float: + """Calculate average reasoning tokens per task.""" + if self.total_tasks == 0: + return 0.0 + return self.total_reasoning_tokens / self.total_tasks @property def total_task_execution_time(self) -> float: @@ -155,9 +171,11 @@ def get_category_stats(self) -> Dict[str, Dict[str, Any]]: "total_input_tokens": 0, "total_output_tokens": 0, "total_tokens": 0, + "total_reasoning_tokens": 0, "avg_input_tokens": 0.0, "avg_output_tokens": 0.0, "avg_total_tokens": 0.0, + "avg_reasoning_tokens": 0.0, "total_turns": 0, "avg_turns": 0.0, } @@ -179,6 +197,9 @@ def get_category_stats(self) -> Dict[str, Dict[str, Any]]: category_stats[category]["total_tokens"] += result.token_usage.get( "total_tokens", 0 ) + category_stats[category]["total_reasoning_tokens"] += result.token_usage.get( + "reasoning_tokens", 0 + ) # Accumulate turns if result.turn_count is not None: @@ -206,6 +227,7 @@ def get_category_stats(self) -> Dict[str, Dict[str, Any]]: stats["total_output_tokens"] / stats["total"] ) stats["avg_total_tokens"] = stats["total_tokens"] / stats["total"] + stats["avg_reasoning_tokens"] = stats["total_reasoning_tokens"] / stats["total"] stats["avg_turns"] = ( stats["total_turns"] / stats["total"] if stats["total"] > 0 else 0 @@ -241,7 +263,9 @@ def save_meta_json( meta_data = { "task_name": task_result.task_name, - "model": model_config.get("model_name", "unknown"), + "model_name": model_config.get("model_name", "unknown"), + "litellm_run_model_name": model_config.get("litellm_run_model_name"), + "reasoning_effort": model_config.get("reasoning_effort"), "mcp": model_config.get("mcp_service", "unknown"), "timeout": model_config.get("timeout", 300), "time": {"start": start_time.isoformat(), "end": end_time.isoformat()}, @@ -272,7 +296,8 @@ def save_model_summary(self, report: EvaluationReport, output_path: Path) -> Pat avg_turns = total_turns / report.total_tasks if report.total_tasks > 0 else 0 summary = { - "model": report.model_name, + "model_name": report.model_name, + "model_config": report.model_config, "total_tasks": report.total_tasks, "successful_tasks": report.successful_tasks, "failed_tasks": report.failed_tasks, @@ -289,9 +314,11 @@ def save_model_summary(self, report: EvaluationReport, output_path: Path) -> Pat "total_input_tokens": report.total_input_tokens, "total_output_tokens": report.total_output_tokens, "total_tokens": report.total_tokens, + "total_reasoning_tokens": report.total_reasoning_tokens, "avg_input_tokens": round(report.avg_input_tokens, 2), "avg_output_tokens": round(report.avg_output_tokens, 2), "avg_total_tokens": round(report.avg_total_tokens, 2), + "avg_reasoning_tokens": 
round(report.avg_reasoning_tokens, 2), }, "turn_usage": { "total_turns": total_turns, @@ -306,9 +333,11 @@ def save_model_summary(self, report: EvaluationReport, output_path: Path) -> Pat "total_input": stats["total_input_tokens"], "total_output": stats["total_output_tokens"], "total": stats["total_tokens"], + "total_reasoning": stats["total_reasoning_tokens"], "avg_input": round(stats["avg_input_tokens"], 2), "avg_output": round(stats["avg_output_tokens"], 2), "avg_total": round(stats["avg_total_tokens"], 2), + "avg_reasoning": round(stats["avg_reasoning_tokens"], 2), }, "turn_usage": { "total_turns": stats["total_turns"], diff --git a/tasks/notion/it_trouble_shooting_hub/verification_expired_update/meta.json b/tasks/notion/it_trouble_shooting_hub/verification_expired_update/meta.json index c09a53d..a4602df 100644 --- a/tasks/notion/it_trouble_shooting_hub/verification_expired_update/meta.json +++ b/tasks/notion/it_trouble_shooting_hub/verification_expired_update/meta.json @@ -21,6 +21,6 @@ "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5", - "stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101" + "stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub" } } \ No newline at end of file
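A closing note on the `src/errors.py` change earlier in this patch: retry eligibility is now a case-insensitive substring match against a short, generic pattern list, so the two pipeline signals the evaluator previously hard-coded in `PIPELINE_RETRY_ERRORS` still trigger a retry, common API failures are caught as well, and task-level messages such as "already exists" no longer are. A small sketch of the resulting behavior:

```python
from src.errors import is_retryable_error

# Pipeline signals formerly hard-coded in PIPELINE_RETRY_ERRORS still match.
assert is_retryable_error("State Duplication Error")
assert is_retryable_error("MCP Network Error")

# Generic API/infrastructure failures now match too.
assert is_retryable_error("RateLimitError: too many requests")
assert is_retryable_error("HTTP 503 Service Unavailable")

# Task-level outcomes are no longer retried, and empty/None errors are handled safely.
assert not is_retryable_error("page already exists")
assert not is_retryable_error(None)
```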