diff --git a/pipeline.py b/pipeline.py index 4b58fad..ddc5bee 100644 --- a/pipeline.py +++ b/pipeline.py @@ -63,10 +63,10 @@ def main(): "--timeout", type=int, default=3600, help="Timeout in seconds for agent execution" ) parser.add_argument( - "--stream", - action="store_true", - default=False, - help="Use streaming execution (default: False, uses non-streaming)", + "--reasoning-effort", + default="default", + choices=["default", "minimal", "low", "medium", "high"], + help="Reasoning effort level for supported models (default: None)", ) # Output configuration @@ -138,7 +138,7 @@ def main(): timeout=args.timeout, exp_name=run_exp_name, output_dir=run_output_dir, - stream=args.stream, + reasoning_effort=args.reasoning_effort, ) pipeline.run_evaluation(args.tasks) diff --git a/src/agents/__init__.py b/src/agents/__init__.py new file mode 100644 index 0000000..ccbf714 --- /dev/null +++ b/src/agents/__init__.py @@ -0,0 +1,11 @@ +""" +MCPMark Agent Module +==================== + +Provides a unified agent implementation using LiteLLM for model interactions +and minimal MCP server management. +""" + +from .mcpmark_agent import MCPMarkAgent + +__all__ = ["MCPMarkAgent"] \ No newline at end of file diff --git a/src/agents/mcp/__init__.py b/src/agents/mcp/__init__.py new file mode 100644 index 0000000..09824d3 --- /dev/null +++ b/src/agents/mcp/__init__.py @@ -0,0 +1,11 @@ +""" +MCP (Model Context Protocol) Components +======================================== + +Minimal MCP server implementations for MCPMark. +""" + +from .stdio_server import MCPStdioServer +from .http_server import MCPHttpServer + +__all__ = ["MCPStdioServer", "MCPHttpServer"] \ No newline at end of file diff --git a/src/agents/mcp/http_server.py b/src/agents/mcp/http_server.py new file mode 100644 index 0000000..9c0cc36 --- /dev/null +++ b/src/agents/mcp/http_server.py @@ -0,0 +1,78 @@ +""" +Minimal MCP HTTP Server Implementation +======================================= + +Provides HTTP-based MCP server communication for services like GitHub. +""" + +import asyncio +from contextlib import AsyncExitStack +from typing import Any, Dict, List, Optional + +from mcp import ClientSession +from mcp.client.streamable_http import streamablehttp_client + +class MCPHttpServer: + """ + HTTP-based MCP client using the official MCP Python SDK + (Streamable HTTP transport). 
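A minimal usage sketch of this client, assuming the GitHub MCP endpoint that `_create_http_server` uses later in this patch and a token in the `GITHUB_TOKEN` environment variable; the tool name and arguments are illustrative, not part of the patch:

```python
import asyncio
import os

from src.agents.mcp import MCPHttpServer


async def main() -> None:
    # Endpoint and header shape mirror what MCPMarkAgent builds for the GitHub service.
    server = MCPHttpServer(
        url="https://api.githubcopilot.com/mcp/",
        headers={"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"},
    )
    async with server:
        tools = await server.list_tools()  # cached after the first call
        print([tool["name"] for tool in tools])

        # Tool name and arguments are placeholders; pick one reported by list_tools().
        result = await server.call_tool("search_repositories", {"query": "mcpmark"})
        print(result)


asyncio.run(main())
```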
+ """ + + def __init__( + self, + url: str, + headers: Optional[Dict[str, str]] = None, + timeout: int = 30, + ): + self.url = url.rstrip("/") + self.headers = headers or {} + self.timeout = timeout + + self._stack: Optional[AsyncExitStack] = None + self.session: Optional[ClientSession] = None + self._tools_cache: Optional[List[Dict[str, Any]]] = None + + async def __aenter__(self): + await self.start() + return self + + async def __aexit__(self, exc_type, exc, tb): + await self.stop() + + async def start(self): + """Open Streamable HTTP transport and initialize MCP session.""" + self._stack = AsyncExitStack() + + read_stream, write_stream, _ = await self._stack.enter_async_context( + streamablehttp_client(self.url, headers=self.headers) + ) + + self.session = await self._stack.enter_async_context(ClientSession(read_stream, write_stream)) + await asyncio.wait_for(self.session.initialize(), timeout=self.timeout) + + async def stop(self): + """Close the session/transport cleanly.""" + if self._stack: + await self._stack.aclose() + self._stack = None + self.session = None + self._tools_cache = None + + async def list_tools(self) -> List[Dict[str, Any]]: + """Return tool definitions (cached).""" + if self._tools_cache is not None: + return self._tools_cache + if not self.session: + raise RuntimeError("MCP HTTP client not started") + + resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout) + self._tools_cache = [t.model_dump() for t in resp.tools] + return self._tools_cache + + async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any: + """Invoke a remote tool and return the structured result.""" + if not self.session: + raise RuntimeError("MCP HTTP client not started") + + result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout) + return result.model_dump() diff --git a/src/agents/mcp/stdio_server.py b/src/agents/mcp/stdio_server.py new file mode 100644 index 0000000..80368fd --- /dev/null +++ b/src/agents/mcp/stdio_server.py @@ -0,0 +1,46 @@ +""" +Minimal MCP Stdio Server Implementation +======================================== + +Provides stdio-based MCP server communication for services like +Notion, Filesystem, Playwright, and Postgres. 
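A matching sketch for the stdio transport, reusing the `npx` filesystem server invocation that `MCPMarkAgent._create_stdio_server` builds further down; the `/tmp/mcpmark` directory and the tool call are illustrative:

```python
import asyncio

from src.agents.mcp import MCPStdioServer


async def main() -> None:
    server = MCPStdioServer(
        command="npx",
        args=["-y", "@modelcontextprotocol/server-filesystem", "/tmp/mcpmark"],
    )
    async with server:
        tools = await server.list_tools()
        print([tool["name"] for tool in tools])

        # Arguments follow each tool's inputSchema; this call is illustrative.
        result = await server.call_tool("list_directory", {"path": "/tmp/mcpmark"})
        print(result)


asyncio.run(main())
```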
+""" + +import asyncio +import os +from contextlib import AsyncExitStack +from typing import Any, Dict, List, Optional + +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client + +class MCPStdioServer: + """Lightweight wrapper around the official MCP Python SDK.""" + + def __init__(self, command: str, args: List[str], env: Optional[Dict[str, str]] = None, timeout: int = 120): + self.params = StdioServerParameters(command=command, args=args, env={**os.environ, **(env or {})}) + self.timeout = timeout + self._stack: Optional[AsyncExitStack] = None + self._streams = None + self.session: Optional[ClientSession] = None + + async def __aenter__(self): + self._stack = AsyncExitStack() + read, write = await self._stack.enter_async_context(stdio_client(self.params)) + self.session = await self._stack.enter_async_context(ClientSession(read, write)) + await asyncio.wait_for(self.session.initialize(), timeout=self.timeout) + return self + + async def __aexit__(self, exc_type, exc, tb): + if self._stack: + await self._stack.aclose() + self._stack = None + self.session = None + + async def list_tools(self) -> List[Dict[str, Any]]: + resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout) + return [t.model_dump() for t in resp.tools] + + async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any: + result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout) + return result.model_dump() # As above, convert the result to a plain dict diff --git a/src/agents/mcpmark_agent.py b/src/agents/mcpmark_agent.py new file mode 100644 index 0000000..a6f729c --- /dev/null +++ b/src/agents/mcpmark_agent.py @@ -0,0 +1,818 @@ +""" +MCPMark Agent Implementation +============================ + +Unified agent using LiteLLM for all model interactions with minimal MCP support. +""" + +import asyncio +import json +import time +import uuid + +from typing import Any, Dict, List, Optional, Callable + +import litellm +import nest_asyncio + +from src.logger import get_logger +from .mcp import MCPStdioServer, MCPHttpServer +from .utils import TokenUsageTracker + +# Apply nested asyncio support +nest_asyncio.apply() + +# Configure LiteLLM +litellm.suppress_debug_info = True + +logger = get_logger(__name__) + +class MCPMarkAgent: + """ + Unified agent for LLM and MCP server management using LiteLLM. + + - Anthropic models: Native MCP support via extra_body + - Other models: Manual MCP server management with function calling + """ + + # Constants + MAX_TURNS = 100 + DEFAULT_TIMEOUT = 600 + + # Service categories + STDIO_SERVICES = ["notion", "filesystem", "playwright", "playwright_webarena", "postgres"] + HTTP_SERVICES = ["github"] + + def __init__( + self, + litellm_input_model_name: str, + api_key: str, + base_url: str, + mcp_service: str, + timeout: int = DEFAULT_TIMEOUT, + service_config: Optional[Dict[str, Any]] = None, + service_config_provider: Optional[Callable[[], Dict]] = None, + reasoning_effort: Optional[str] = "default", + ): + """ + Initialize the MCPMark agent.
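For orientation, a construction sketch in the spirit of how `src/evaluator.py` wires this agent up; the literal values here are illustrative stand-ins for what `ModelConfig` and the service state manager provide:

```python
from src.agents import MCPMarkAgent

agent = MCPMarkAgent(
    litellm_input_model_name="openai/gpt-5",            # LiteLLM route, e.g. from ModelConfig
    api_key="sk-...",                                    # provider key (illustrative)
    base_url=None,                                       # optional gateway override
    mcp_service="filesystem",
    timeout=600,
    service_config={"test_directory": "/tmp/mcpmark"},   # what the filesystem service needs
    reasoning_effort="high",                             # "default" leaves the parameter unset
)
```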
+ + Args: + model_name: Name of the LLM model + api_key: API key for the model provider + base_url: Base url + mcp_service: MCP service type + timeout: Execution timeout in seconds + service_config: Service-specific configuration + service_config_provider: Optional provider for dynamic config + reasoning_effort: Reasoning effort level ("default", "minimal", "low", "medium", "high") + """ + self.litellm_input_model_name = litellm_input_model_name + self.api_key = api_key + self.base_url = base_url + self.mcp_service = mcp_service + self.timeout = timeout + self.service_config = service_config or {} + self._service_config_provider = service_config_provider + self.reasoning_effort = reasoning_effort + + # Detect if this is an Anthropic model with HTTP MCP service + self.is_anthropic = self._is_anthropic_model(litellm_input_model_name) + self.use_anthropic_native = self.is_anthropic and mcp_service in self.HTTP_SERVICES + + # Initialize usage tracker + self.usage_tracker = TokenUsageTracker() + + # Track the actual model name from responses + self.litellm_run_model_name = None + + logger.debug( + f"Initialized MCPMarkAgent for '{mcp_service}' with model '{litellm_input_model_name}' " + f"(Anthropic Native MCP: {self.use_anthropic_native}, Reasoning: {reasoning_effort})" + ) + + def _is_anthropic_model(self, model_name: str) -> bool: + """Check if the model is an Anthropic model.""" + return "claude" in model_name.lower() + + + def _refresh_service_config(self): + """Refresh service config from provider if available.""" + if self._service_config_provider: + try: + latest_cfg = self._service_config_provider() or {} + self.service_config.update(latest_cfg) + except Exception as e: + logger.warning(f"| Failed to refresh service config: {e}") + + async def execute( + self, + instruction: str, + tool_call_log_file: Optional[str] = None + ) -> Dict[str, Any]: + """ + Execute instruction with the agent. 
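Whichever execution path is taken, the returned dictionary has the same shape; a consumption sketch, assuming an `agent` constructed as in the previous example (the instruction and log path are illustrative):

```python
result = agent.execute_sync(
    "Create a file named notes.txt in the test directory.",
    tool_call_log_file="tool_calls.log",  # optional
)

print(result["success"])         # bool
print(result["turn_count"])      # LLM turns used (1 on the Anthropic native path)
print(result["token_usage"])     # input/output/total tokens, plus reasoning_tokens when reported
print(result["execution_time"])  # wall-clock seconds measured by execute()
print(result["error"])           # None on success, otherwise the error message
print(len(result["output"]))     # conversation converted to the legacy SDK message format
```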
+ + Args: + instruction: The instruction/prompt to execute + tool_call_log_file: Optional path to log tool calls + + Returns: + Dictionary containing execution results + """ + start_time = time.time() + + try: + # Refresh service configuration + self._refresh_service_config() + + # Execute with timeout control + async def _execute_with_strategy(): + if self.use_anthropic_native: + # Use native MCP support only for Anthropic + HTTP MCP services (e.g., GitHub) + return await self._execute_anthropic_native( + instruction, tool_call_log_file + ) + else: + # Use manual MCP management for all other cases + # This includes: non-Anthropic models, or Anthropic with STDIO services + return await self._execute_with_manual_mcp( + instruction, tool_call_log_file + ) + + # Apply timeout to the entire execution + result = await asyncio.wait_for( + _execute_with_strategy(), + timeout=self.timeout + ) + + execution_time = time.time() - start_time + + # Update usage statistics + self.usage_tracker.update( + success=result["success"], + token_usage=result.get("token_usage", {}), + turn_count=result.get("turn_count", 0), + execution_time=execution_time + ) + + result["execution_time"] = execution_time + return result + + except asyncio.TimeoutError: + execution_time = time.time() - start_time + error_msg = f"Execution timed out after {self.timeout} seconds" + logger.error(error_msg) + + self.usage_tracker.update( + success=False, + token_usage={}, + turn_count=0, + execution_time=execution_time + ) + + return { + "success": False, + "output": [], + "token_usage": {}, + "turn_count": 0, + "execution_time": execution_time, + "error": error_msg + } + + except Exception as e: + execution_time = time.time() - start_time + error_msg = f"Agent execution failed: {e}" + logger.error(error_msg, exc_info=True) + + self.usage_tracker.update( + success=False, + token_usage={}, + turn_count=0, + execution_time=execution_time + ) + + return { + "success": False, + "output": [], + "token_usage": {}, + "turn_count": 0, + "execution_time": execution_time, + "error": str(e) + } + + async def _execute_anthropic_native( + self, + instruction: str, + tool_call_log_file: Optional[str] = None + ) -> Dict[str, Any]: + """ + Execute using Anthropic's native MCP support for HTTP-based services. + Only used for Claude models with GitHub MCP service. 
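Each entry of `extra_body["mcp_servers"]` follows the shape that `_get_anthropic_mcp_config()` returns further down; for the GitHub service it looks like this (token value illustrative):

```python
extra_body = {
    "mcp_servers": [
        {
            "type": "url",
            "url": "https://api.githubcopilot.com/mcp/",
            "name": "github",
            "authorization_token": "<github_token from service_config>",
        }
    ]
}
```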
+ """ + logger.debug("Using Anthropic native MCP execution for HTTP service") + + # Get MCP configuration for Anthropic + mcp_config = self._get_anthropic_mcp_config() + + # Prepare messages + messages = [{"role": "user", "content": instruction}] + + # Prepare extra headers and body for Anthropic + extra_headers = { + "anthropic-version": "2023-06-01", + "anthropic-beta": "mcp-client-2025-04-04", + } + + extra_body = { + "mcp_servers": [mcp_config], + } + + try: + return await self._get_anthropic_response( + messages, extra_headers, extra_body, tool_call_log_file + ) + except Exception as e: + # Fallback: return minimal failure result with user message captured + logger.error(f"Anthropic native path failed: {e}", exc_info=True) + sdk_format_messages = self._convert_to_sdk_format(messages) + return { + "success": False, + "output": sdk_format_messages, + "token_usage": {}, + "turn_count": 0, + "error": str(e), + "litellm_run_model_name": self.litellm_run_model_name, + } + + async def _get_anthropic_response( + self, + messages: List[Dict], + extra_headers: Dict, + extra_body: Dict, + tool_call_log_file: Optional[str] = None + ) -> Dict[str, Any]: + """Get non-streaming response from Anthropic.""" + try: + # Build completion kwargs + completion_kwargs = { + "model": self.litellm_input_model_name, + "messages": messages, + "api_key": self.api_key, + "extra_headers": extra_headers, + "extra_body": extra_body, + } + + # Add reasoning_effort if specified + if self.reasoning_effort != "default": + completion_kwargs["reasoning_effort"] = self.reasoning_effort + + response = await litellm.acompletion(**completion_kwargs) + + # Extract actual model name from response + if hasattr(response, 'model') and response.model: + self.litellm_run_model_name = response.model + + # Extract response content + content = response.choices[0].message.content if response.choices else "" + + # Extract token usage including reasoning tokens + token_usage = {} + if hasattr(response, 'usage') and response.usage: + token_usage = { + "input_tokens": response.usage.prompt_tokens or 0, + "output_tokens": response.usage.completion_tokens or 0, + "total_tokens": response.usage.total_tokens or 0 + } + + # Extract reasoning tokens if available + if hasattr(response.usage, 'completion_tokens_details'): + details = response.usage.completion_tokens_details + if hasattr(details, 'reasoning_tokens'): + token_usage["reasoning_tokens"] = details.reasoning_tokens or 0 + + # Log to file if specified + if tool_call_log_file and content: + with open(tool_call_log_file, 'a', encoding='utf-8') as f: + f.write(content + "\n") + + # Display token usage + if token_usage: + log_msg = ( + f"\n| Token usage: Total: {token_usage['total_tokens']:,} | " + f"Input: {token_usage['input_tokens']:,} | " + f"Output: {token_usage['output_tokens']:,}" + ) + if "reasoning_tokens" in token_usage: + log_msg += f" | Reasoning: {token_usage['reasoning_tokens']:,}" + logger.info(log_msg) + + # Convert to SDK format for backward compatibility + messages_with_response = messages + [{"role": "assistant", "content": content}] + sdk_format_messages = self._convert_to_sdk_format(messages_with_response) + + return { + "success": True, + "output": sdk_format_messages, + "token_usage": token_usage, + "turn_count": 1, + "error": None, + "litellm_run_model_name": self.litellm_run_model_name + } + + except Exception as e: + logger.error(f"Anthropic execution failed: {e}") + raise + + async def _execute_with_manual_mcp( + self, + instruction: str, + tool_call_log_file: 
Optional[str] = None + ) -> Dict[str, Any]: + """ + Execute with manual MCP server management. + Used for all non-Anthropic models and Anthropic models with STDIO services. + """ + logger.debug("Using manual MCP execution with function calling loop") + + # Create and start MCP server + mcp_server = await self._create_mcp_server() + + try: + async with mcp_server: + # Get available tools + tools = await mcp_server.list_tools() + + # Convert MCP tools to OpenAI function format + functions = self._convert_mcp_tools_to_functions(tools) + + # Execute with function calling loop + return await self._execute_function_calling_loop( + instruction, functions, mcp_server, tool_call_log_file + ) + + except Exception as e: + logger.error(f"Manual MCP execution failed: {e}") + raise + + def _convert_to_sdk_format(self, messages: List[Dict]) -> List[Dict]: + """Convert OpenAI messages format to old SDK format for backward compatibility.""" + sdk_format = [] + function_call_map = {} # Track function names to call IDs for legacy format + + for msg in messages: + role = msg.get("role") + + if role == "user": + # User messages stay mostly the same + sdk_format.append({ + "content": msg.get("content", ""), + "role": "user" + }) + + elif role == "assistant": + # === CHANGED ORDER START === + tool_calls = msg.get("tool_calls", []) + function_call = msg.get("function_call") + content = msg.get("content") + + # 1) First add assistant's text content (if present) + if content: + sdk_format.append({ + "id": "__fake_id__", + "content": [ + { + "annotations": [], + "text": content if content else "", + "type": "output_text" + } + ], + "role": "assistant", + "status": "completed", + "type": "message" + }) + + # 2) Then add (new format) tool_calls + if tool_calls: + for tool_call in tool_calls: + call_id = tool_call.get("id", f"call_{uuid.uuid4().hex}") + func_name = tool_call.get("function", {}).get("name", "") + sdk_format.append({ + "arguments": tool_call.get("function", {}).get("arguments", "{}"), + "call_id": call_id, + "name": func_name, + "type": "function_call", + "id": "__fake_id__" + }) + + # 3) Finally handle (legacy format) function_call + if function_call: + func_name = function_call.get("name", "") + call_id = f"call_{uuid.uuid4().hex}" + function_call_map[func_name] = call_id # Store for matching responses + sdk_format.append({ + "arguments": function_call.get("arguments", "{}"), + "call_id": call_id, + "name": func_name, + "type": "function_call", + "id": "__fake_id__" + }) + + # 4) If neither content nor any calls exist, maintain fallback behavior + if not content and not tool_calls and not function_call: + sdk_format.append({ + "id": "__fake_id__", + "content": [ + { + "annotations": [], + "text": "", + "type": "output_text" + } + ], + "role": "assistant", + "status": "completed", + "type": "message" + }) + # === CHANGED ORDER END === + + elif role == "tool": + # Tool responses + sdk_format.append({ + "call_id": msg.get("tool_call_id", ""), + "output": json.dumps({ + "type": "text", + "text": msg.get("content", ""), + "annotations": None, + "meta": None + }), + "type": "function_call_output" + }) + + elif role == "function": + # Legacy function responses - try to match with stored call ID + func_name = msg.get("name", "") + call_id = function_call_map.get(func_name, f"call_{uuid.uuid4().hex}") + sdk_format.append({ + "call_id": call_id, + "output": json.dumps({ + "type": "text", + "text": msg.get("content", ""), + "annotations": None, + "meta": None + }), + "type": "function_call_output" + }) + + return 
sdk_format + + + async def _execute_function_calling_loop( + self, + instruction: str, + functions: List[Dict], + mcp_server: Any, + tool_call_log_file: Optional[str] = None + ) -> Dict[str, Any]: + """Execute function calling loop with LiteLLM.""" + messages = [ + {"role": "system", "content": "You are a helpful agent that uses tools iteratively to complete the user's task, and when finished, provides the final answer or simply states \"Task completed\" without further tool calls."}, + {"role": "user", "content": instruction} + ] + total_tokens = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0, "reasoning_tokens": 0} + turn_count = 0 + max_turns = self.MAX_TURNS # Limit turns to prevent infinite loops + consecutive_failures = 0 + max_consecutive_failures = 3 + + # Convert functions to tools format for newer models + tools = [{"type": "function", "function": func} for func in functions] if functions else None + + try: + while turn_count < max_turns: + + # Build completion kwargs + completion_kwargs = { + "model": self.litellm_input_model_name, + "messages": messages, + "api_key": self.api_key, + } + + # Always use tools format if available - LiteLLM will handle conversion + if tools: + completion_kwargs["tools"] = tools + completion_kwargs["tool_choice"] = "auto" + + # Add reasoning_effort and base_url if specified + if self.reasoning_effort != "default": + completion_kwargs["reasoning_effort"] = self.reasoning_effort + if self.base_url: + completion_kwargs["base_url"] = self.base_url + + try: + # Call LiteLLM with timeout for individual call + response = await asyncio.wait_for( + litellm.acompletion(**completion_kwargs), + timeout = self.timeout / 2 # Use half of total timeout + ) + consecutive_failures = 0 # Reset failure counter on success + except asyncio.TimeoutError: + logger.warning(f"| ✗ LLM call timed out on turn {turn_count + 1}") + consecutive_failures += 1 + if consecutive_failures >= max_consecutive_failures: + raise Exception(f"Too many consecutive failures ({consecutive_failures})") + await asyncio.sleep(8 ** consecutive_failures) # Exponential backoff + continue + except Exception as e: + logger.error(f"| ✗ LLM call failed on turn {turn_count + 1}: {e}") + consecutive_failures += 1 + if consecutive_failures >= max_consecutive_failures: + raise + await asyncio.sleep(8 ** consecutive_failures) # Exponential backoff + continue + + # Extract actual model name from response (first turn only) + if turn_count == 0 and hasattr(response, 'model') and response.model: + self.litellm_run_model_name = response.model.split("/")[-1] + + # Update token usage including reasoning tokens + if hasattr(response, 'usage') and response.usage: + total_tokens["input_tokens"] += response.usage.prompt_tokens or 0 + total_tokens["output_tokens"] += response.usage.completion_tokens or 0 + total_tokens["total_tokens"] += response.usage.total_tokens or 0 + + # Extract reasoning tokens if available + if hasattr(response.usage, 'completion_tokens_details'): + details = response.usage.completion_tokens_details + if hasattr(details, 'reasoning_tokens'): + total_tokens["reasoning_tokens"] += details.reasoning_tokens or 0 + + # Get response message + choices = response.choices + if len(choices): + message = choices[0].message + else: + break + + # Convert to dict (prefer model_dump over deprecated dict()) + message_dict = message.model_dump() if hasattr(message, 'model_dump') else dict(message) + + # Log assistant's text content if present + if hasattr(message, 'content') and message.content: + # 
Display the content with line prefix + for line in message.content.splitlines(): + logger.info(f"| {line}") + + # Also log to file if specified + if tool_call_log_file: + with open(tool_call_log_file, 'a', encoding='utf-8') as f: + f.write(f"{message.content}\n") + + # Check for tool calls (newer format) + if hasattr(message, 'tool_calls') and message.tool_calls: + messages.append(message_dict) + turn_count += 1 + # Process tool calls + for tool_call in message.tool_calls: + func_name = tool_call.function.name + func_args = json.loads(tool_call.function.arguments) + + try: + result = await asyncio.wait_for( + mcp_server.call_tool(func_name, func_args), + timeout=30 + ) + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": json.dumps(result) + }) + except asyncio.TimeoutError: + error_msg = f"Tool call '{func_name}' timed out after 30 seconds" + logger.error(error_msg) + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": f"Error: {error_msg}" + }) + except Exception as e: + logger.error(f"Tool call failed: {e}") + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": f"Error: {str(e)}" + }) + + # Format arguments for display (truncate if too long) + args_str = json.dumps(func_args, separators=(",", ": ")) + display_arguments = args_str[:140] + "..." if len(args_str) > 140 else args_str + + # Log with ANSI color codes (bold tool name, dim gray arguments) + logger.info(f"| \033[1m{func_name}\033[0m \033[2;37m{display_arguments}\033[0m") + + if tool_call_log_file: + with open(tool_call_log_file, 'a', encoding='utf-8') as f: + f.write(f"| {func_name} {args_str}\n") + continue + else: + # No tool/function call, add message and we're done + messages.append(message_dict) + turn_count += 1 + break + except Exception as loop_error: + # On any error, return partial conversation, token usage, and turn count + logger.error(f"Manual MCP loop failed: {loop_error}", exc_info=True) + sdk_format_messages = self._convert_to_sdk_format(messages) + return { + "success": False, + "output": sdk_format_messages, + "token_usage": total_tokens, + "turn_count": turn_count, + "error": str(loop_error), + "litellm_run_model_name": self.litellm_run_model_name, + } + + # Display final token usage + if total_tokens["total_tokens"] > 0: + log_msg = ( + f"|\n| Token usage: Total: {total_tokens['total_tokens']:,} | " + f"Input: {total_tokens['input_tokens']:,} | " + f"Output: {total_tokens['output_tokens']:,}" + ) + if total_tokens.get("reasoning_tokens", 0) > 0: + log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}" + logger.info(log_msg) + logger.info(f"| Turns: {turn_count}") + + # Convert messages to SDK format for backward compatibility + sdk_format_messages = self._convert_to_sdk_format(messages) + + return { + "success": True, + "output": sdk_format_messages, + "token_usage": total_tokens, + "turn_count": turn_count, + "error": None, + "litellm_run_model_name": self.litellm_run_model_name + } + + def _convert_mcp_tools_to_functions(self, tools: List[Dict]) -> List[Dict]: + """Convert MCP tool definitions to OpenAI function format.""" + functions = [] + + for tool in tools: + function = { + "name": tool.get("name"), + "description": tool.get("description", ""), + "parameters": tool.get("inputSchema", { + "type": "object", + "properties": {}, + "required": [] + }) + } + functions.append(function) + + return functions + + + def _get_anthropic_mcp_config(self) -> Dict[str, Any]: + """Get MCP configuration for Anthropic native 
support.""" + if self.mcp_service == "github": + return { + "type": "url", + "url": "https://api.githubcopilot.com/mcp/", + "name": "github", + "authorization_token": self.service_config.get("github_token", "") + } + else: + # For stdio-based services, Anthropic expects a different format + # This would need to be implemented based on Anthropic's requirements + raise NotImplementedError( + f"Anthropic native MCP for {self.mcp_service} not yet implemented" + ) + + async def _create_mcp_server(self) -> Any: + """Create and return an MCP server instance.""" + if self.mcp_service in self.STDIO_SERVICES: + return self._create_stdio_server() + elif self.mcp_service in self.HTTP_SERVICES: + return self._create_http_server() + else: + raise ValueError(f"Unsupported MCP service: {self.mcp_service}") + + def _create_stdio_server(self) -> MCPStdioServer: + """Create stdio-based MCP server.""" + if self.mcp_service == "notion": + notion_key = self.service_config.get("notion_key") + if not notion_key: + raise ValueError("Notion API key required") + + return MCPStdioServer( + command="npx", + args=["-y", "@notionhq/notion-mcp-server"], + env={ + "OPENAPI_MCP_HEADERS": ( + '{"Authorization": "Bearer ' + notion_key + '", ' + '"Notion-Version": "2022-06-28"}' + ) + } + ) + + elif self.mcp_service == "filesystem": + test_directory = self.service_config.get("test_directory") + if not test_directory: + raise ValueError("Test directory required for filesystem service") + + return MCPStdioServer( + command="npx", + args=["-y", "@modelcontextprotocol/server-filesystem", str(test_directory)] + ) + + elif self.mcp_service in ["playwright", "playwright_webarena"]: + browser = self.service_config.get("browser", "chromium") + headless = self.service_config.get("headless", True) + viewport_width = self.service_config.get("viewport_width", 1280) + viewport_height = self.service_config.get("viewport_height", 720) + + args = ["-y", "@playwright/mcp@latest"] + if headless: + args.append("--headless") + args.extend([ + "--isolated", + "--no-sandbox", + "--browser", browser, + "--viewport-size", f"{viewport_width},{viewport_height}" + ]) + + return MCPStdioServer(command="npx", args=args) + + elif self.mcp_service == "postgres": + host = self.service_config.get("host", "localhost") + port = self.service_config.get("port", 5432) + username = self.service_config.get("username") + password = self.service_config.get("password") + database = self.service_config.get("current_database") or self.service_config.get("database") + + if not all([username, password, database]): + raise ValueError("PostgreSQL requires username, password, and database") + + database_url = f"postgresql://{username}:{password}@{host}:{port}/{database}" + + return MCPStdioServer( + command="pipx", + args=["run", "postgres-mcp", "--access-mode=unrestricted"], + env={"DATABASE_URI": database_url} + ) + + else: + raise ValueError(f"Unsupported stdio service: {self.mcp_service}") + + def _create_http_server(self) -> MCPHttpServer: + """Create HTTP-based MCP server.""" + if self.mcp_service == "github": + github_token = self.service_config.get("github_token") + if not github_token: + raise ValueError("GitHub token required") + + return MCPHttpServer( + url="https://api.githubcopilot.com/mcp/", + headers={ + "Authorization": f"Bearer {github_token}", + "User-Agent": "MCPMark/1.0" + } + ) + else: + raise ValueError(f"Unsupported HTTP service: {self.mcp_service}") + + def execute_sync( + self, + instruction: str, + tool_call_log_file: Optional[str] = None + ) -> 
Dict[str, Any]: + """ + Synchronous wrapper for execute method. + """ + try: + return asyncio.run(self.execute(instruction, tool_call_log_file)) + except asyncio.TimeoutError: + self.usage_tracker.update(False, {}, 0, self.timeout) + return { + "success": False, + "output": [], + "token_usage": {}, + "turn_count": 0, + "execution_time": self.timeout, + "error": f"Execution timed out after {self.timeout} seconds" + } + + def get_usage_stats(self) -> Dict[str, Any]: + """Get usage statistics.""" + return self.usage_tracker.get_stats() + + def reset_usage_stats(self): + """Reset usage statistics.""" + self.usage_tracker.reset() + + def __repr__(self): + return ( + f"MCPMarkAgent(service='{self.mcp_service}', model='{self.litellm_input_model_name}', " + ) \ No newline at end of file diff --git a/src/agents/utils/__init__.py b/src/agents/utils/__init__.py new file mode 100644 index 0000000..f5b2242 --- /dev/null +++ b/src/agents/utils/__init__.py @@ -0,0 +1,8 @@ +""" +Utility functions for MCPMark Agent +==================================== +""" + +from .token_usage import TokenUsageTracker + +__all__ = ["TokenUsageTracker"] \ No newline at end of file diff --git a/src/agents/utils/token_usage.py b/src/agents/utils/token_usage.py new file mode 100644 index 0000000..17ac0f4 --- /dev/null +++ b/src/agents/utils/token_usage.py @@ -0,0 +1,78 @@ +""" +Token Usage Tracking Utilities +=============================== +""" + +from typing import Dict, Any + + +class TokenUsageTracker: + """Track token usage across agent executions.""" + + def __init__(self): + """Initialize token usage tracker.""" + self.reset() + + def reset(self): + """Reset all usage statistics.""" + self._stats = { + "total_input_tokens": 0, + "total_output_tokens": 0, + "total_tokens": 0, + "total_turns": 0, + "total_execution_time": 0.0, + "successful_executions": 0, + "failed_executions": 0, + } + + def update(self, success: bool, token_usage: Dict[str, int], + turn_count: int, execution_time: float): + """ + Update usage statistics. + + Args: + success: Whether execution was successful + token_usage: Token usage dict with input_tokens, output_tokens, total_tokens + turn_count: Number of conversation turns + execution_time: Execution time in seconds + """ + if success: + self._stats["successful_executions"] += 1 + else: + self._stats["failed_executions"] += 1 + + self._stats["total_input_tokens"] += token_usage.get("input_tokens", 0) + self._stats["total_output_tokens"] += token_usage.get("output_tokens", 0) + self._stats["total_tokens"] += token_usage.get("total_tokens", 0) + self._stats["total_turns"] += turn_count + self._stats["total_execution_time"] += execution_time + + def get_stats(self) -> Dict[str, Any]: + """ + Get usage statistics with calculated averages. 
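A quick sketch of the tracker on its own (the numbers are made up):

```python
from src.agents.utils import TokenUsageTracker

tracker = TokenUsageTracker()
tracker.update(
    success=True,
    token_usage={"input_tokens": 1200, "output_tokens": 300, "total_tokens": 1500},
    turn_count=4,
    execution_time=12.5,
)
tracker.update(success=False, token_usage={}, turn_count=0, execution_time=3.0)

stats = tracker.get_stats()
print(stats["success_rate"])      # 50.0 (percentage over all executions)
print(stats["avg_total_tokens"])  # 750.0
print(stats["avg_turns"])         # 2.0
```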
+ + Returns: + Dictionary containing usage statistics + """ + stats = self._stats.copy() + + # Calculate averages + total_executions = stats["successful_executions"] + stats["failed_executions"] + if total_executions > 0: + stats["avg_input_tokens"] = stats["total_input_tokens"] / total_executions + stats["avg_output_tokens"] = stats["total_output_tokens"] / total_executions + stats["avg_total_tokens"] = stats["total_tokens"] / total_executions + stats["avg_turns"] = stats["total_turns"] / total_executions + stats["avg_execution_time"] = stats["total_execution_time"] / total_executions + stats["success_rate"] = (stats["successful_executions"] / total_executions * 100) + else: + stats.update({ + "avg_input_tokens": 0.0, + "avg_output_tokens": 0.0, + "avg_total_tokens": 0.0, + "avg_turns": 0.0, + "avg_execution_time": 0.0, + "success_rate": 0.0, + }) + + return stats \ No newline at end of file diff --git a/src/aggregators/aggregate_results.py b/src/aggregators/aggregate_results.py index c2e8344..1852404 100644 --- a/src/aggregators/aggregate_results.py +++ b/src/aggregators/aggregate_results.py @@ -127,7 +127,7 @@ def collect_task_results_from_run( "task_execution_time": meta.get("task_execution_time", 0), "token_usage": meta.get("token_usage", {}), "turn_count": meta.get("turn_count", 0), - "actual_model_name": meta.get("actual_model_name"), + "model_name_name": meta.get("model_name_name"), "meta": meta, # Keep full meta for model_results } except Exception as e: @@ -164,7 +164,7 @@ def calculate_k_run_metrics( "total_output_tokens": 0, "total_tokens": 0, "total_turns": 0, - "actual_model_name": None, + "model_name_name": None, } ) @@ -240,14 +240,14 @@ def calculate_k_run_metrics( service_model_metrics[service_model]["total_tokens"] += sum(task_total_tokens) service_model_metrics[service_model]["total_turns"] += sum(task_turns) - # Store actual_model_name from first available task result - if service_model_metrics[service_model]["actual_model_name"] is None: + # Store model_name_name from first available task result + if service_model_metrics[service_model]["model_name_name"] is None: for run_name in runs_to_process: if run_name in all_runs_results: run_results = all_runs_results[run_name] task_result = run_results.get(task_key) - if task_result and task_result.get("actual_model_name"): - service_model_metrics[service_model]["actual_model_name"] = task_result["actual_model_name"] + if task_result and task_result.get("model_name_name"): + service_model_metrics[service_model]["model_name_name"] = task_result["model_name_name"] break # pass@1: Will be calculated as avg@k later (skip individual task counting) @@ -372,7 +372,7 @@ def aggregate_single_run_results( total_output_tokens = 0 total_tokens = 0 total_turns = 0 - actual_model_name = None + model_name_name = None for task_dir in service_model_dir.iterdir(): if not task_dir.is_dir(): @@ -403,9 +403,9 @@ def aggregate_single_run_results( total_tokens += token_usage.get("total_tokens", 0) or 0 total_turns += meta.get("turn_count", 0) or 0 - # Store actual_model_name from first available meta - if actual_model_name is None and meta.get("actual_model_name"): - actual_model_name = meta.get("actual_model_name") + # Store model_name_name from first available meta + if model_name_name is None and meta.get("model_name_name"): + model_name_name = meta.get("model_name_name") except Exception as e: print(f"⚠️ Error reading {meta_path}: {e}") @@ -428,7 +428,7 @@ def aggregate_single_run_results( "avg_output_tokens": round(total_output_tokens / 
total_tasks, 4), "avg_total_tokens": round(total_tokens / total_tasks, 4), "avg_turns": round(total_turns / total_tasks, 4), - "actual_model_name": actual_model_name, + "model_name_name": model_name_name, } return service_model_results @@ -529,11 +529,11 @@ def create_simplified_summary( model_metrics["per_run_output_tokens"] = per_run_output_tokens model_metrics["per_run_cost"] = per_run_cost - # Set actual_model_name from first available service data + # Set model_name_name from first available service data for service_model, metrics in service_model_results.items(): if "__" in service_model and service_model.split("__", 1)[1] == model: - if metrics.get("actual_model_name"): - model_metrics["actual_model_name"] = metrics["actual_model_name"] + if metrics.get("model_name_name"): + model_metrics["model_name_name"] = metrics["model_name_name"] break if k > 1: @@ -697,8 +697,8 @@ def create_simplified_summary( "per_run_cost": per_run_cost, } - # Add actual_model_name to service-level metrics - formatted_metrics["actual_model_name"] = metrics.get("actual_model_name") + # Add model_name_name to service-level metrics + formatted_metrics["model_name_name"] = metrics.get("model_name_name") summary[service][model] = formatted_metrics return summary diff --git a/src/errors.py b/src/errors.py index ca2742a..36eb8c3 100644 --- a/src/errors.py +++ b/src/errors.py @@ -9,23 +9,26 @@ from typing import Optional -# Retryable error patterns +"""Retryable error detection via minimal substring matching (lower-case).""" + +# Keep this list short and generic; aim to catch API/infrastructure issues only. RETRYABLE_PATTERNS = { - "timeout", - "timed out", - "etimedout", - "econnrefused", - "connection refused", - "network error", + "ratelimit", # e.g., RateLimitError, too many requests + "timeout", # any timeout wording + "connection", # connection refused/reset/error + "unavailable", # service unavailable + "internal server error", # 500s + "network error", # generic network issue + "quota", # budget/quota exceeded + # pipeline infra signals "mcp network error", "state duplication error", - "already exists", } def is_retryable_error(error: str) -> bool: - """Check if an error message indicates it should be retried.""" - error_lower = str(error).lower() + """Return True if the error string contains any retryable pattern.""" + error_lower = str(error or "").lower() return any(pattern in error_lower for pattern in RETRYABLE_PATTERNS) @@ -57,8 +60,3 @@ def standardize_error_message(error: str, mcp_service: Optional[str] = None) -> return f"{mcp_service.title()} {base_msg}" return base_msg - - -def get_retry_delay(attempt: int, base_delay: int = 5) -> int: - """Get exponential backoff delay for retries.""" - return min(base_delay * (2 ** (attempt - 1)), 60) # Cap at 60 seconds diff --git a/src/evaluator.py b/src/evaluator.py index 819ad7f..aed77a2 100644 --- a/src/evaluator.py +++ b/src/evaluator.py @@ -10,12 +10,8 @@ from src.factory import MCPServiceFactory from src.model_config import ModelConfig from src.results_reporter import EvaluationReport, ResultsReporter, TaskResult -from src.agent import MCPAgent - -PIPELINE_RETRY_ERRORS: List[str] = [ - "State Duplication Error", - "MCP Network Error", -] +from src.errors import is_retryable_error +from src.agents import MCPMarkAgent # Initialize logger logger = get_logger(__name__) @@ -29,18 +25,23 @@ def __init__( timeout: int = 300, exp_name: str = "test-run", output_dir: Path = None, - stream: bool = False, + reasoning_effort: str = "default", ): # Main 
configuration self.mcp_service = mcp_service - self.model = model self.timeout = timeout - + # Initialize model configuration - model_config = ModelConfig(model) - self.actual_model_name = model_config.actual_model_name - self.base_url = model_config.base_url + self.reasoning_effort = reasoning_effort + self.model_name = model + + model_config = ModelConfig(self.model_name) self.api_key = model_config.api_key + self.base_url = model_config.base_url + self.litellm_input_model_name = model_config.litellm_input_model_name + + # Track the actual model name from LiteLLM responses + self.litellm_run_model_name = None # Initialize managers using the factory pattern (simplified) self.task_manager = MCPServiceFactory.create_task_manager(mcp_service) @@ -53,22 +54,25 @@ def __init__( # automatically refresh its service configuration from the state # manager before each execution, so per-task manual updates are no # longer needed. - self.agent = MCPAgent( - model_name=self.actual_model_name, + self.agent = MCPMarkAgent( + litellm_input_model_name=self.litellm_input_model_name, # Use the original model name for detection api_key=self.api_key, base_url=self.base_url, mcp_service=mcp_service, timeout=timeout, service_config=self.service_config, service_config_provider=self.state_manager.get_service_config_for_agent, - stream=stream, + reasoning_effort=self.reasoning_effort, ) # Initialize results reporter self.results_reporter = ResultsReporter() # Output directory handling - model_slug = self.model.replace(".", "-") + if self.reasoning_effort != "default": + model_slug = self.model_name.replace(".", "-") + "-" + self.reasoning_effort + else: + model_slug = self.model_name.replace(".", "-") self.base_experiment_dir = output_dir / exp_name / f"{mcp_service}__{model_slug}" self.base_experiment_dir.mkdir(parents=True, exist_ok=True) @@ -216,6 +220,10 @@ def _run_single_task(self, task) -> TaskResult: ) agent_execution_time = time.time() - agent_execution_start_time + + # Extract actual model name from LiteLLM response + if agent_result.get("litellm_run_model_name"): + self.litellm_run_model_name = agent_result["litellm_run_model_name"] # Write messages.json to task_output_dir messages_path = task_output_dir / "messages.json" @@ -284,7 +292,7 @@ def run_evaluation(self, task_filter: str) -> EvaluationReport: retry_due_to_error = ( existing_result is not None and not existing_result.success - and existing_result.error_message in PIPELINE_RETRY_ERRORS + and is_retryable_error(existing_result.error_message) ) if existing_result and not retry_due_to_error: @@ -334,7 +342,9 @@ def run_evaluation(self, task_filter: str) -> EvaluationReport: meta_path = task_output_dir / "meta.json" model_config = { "mcp_service": self.mcp_service, - "model_name": self.actual_model_name, + "model_name": self.model_name, + "litellm_run_model_name": self.litellm_run_model_name, + "reasoning_effort": self.reasoning_effort, "timeout": self.timeout, } self.results_reporter.save_meta_json( @@ -375,10 +385,12 @@ def _matches_filter(tr: TaskResult, flt: str) -> bool: final_results = list(merged.values()) aggregated_report = EvaluationReport( - model_name=self.model, + model_name=self.model_name, model_config={ "mcp_service": self.mcp_service, - "model_name": self.actual_model_name, + "model_name": self.model_name, + "litellm_run_model_name": self.litellm_run_model_name, + "reasoning_effort": self.reasoning_effort, "timeout": self.timeout, }, total_tasks=len(final_results), diff --git a/src/model_config.py b/src/model_config.py index 
b51b378..ff2f9b0 100644 --- a/src/model_config.py +++ b/src/model_config.py @@ -28,123 +28,130 @@ class ModelConfig: "gpt-4o": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-4o", + "litellm_input_model_name": "openai/gpt-4o", }, "gpt-4.1": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-4.1", + "litellm_input_model_name": "openai/gpt-4.1", }, "gpt-4.1-mini": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-4.1-mini", + "litellm_input_model_name": "openai/gpt-4.1-mini", }, "gpt-5": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-5", + "litellm_input_model_name": "openai/gpt-5", }, "gpt-5-mini": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-5-mini", + "litellm_input_model_name": "openai/gpt-5-mini", }, "gpt-5-nano": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "gpt-5-nano", + "litellm_input_model_name": "openai/gpt-5-nano", }, "o3": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "o3", + "litellm_input_model_name": "openai/o3", }, "o4-mini": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": "o4-mini", + "litellm_input_model_name": "openai/o4-mini", + }, + "gpt-oss-120b": { + "provider": "openai", + "api_key_var": "OPENROUTER_API_KEY", + "litellm_input_model_name": "openrouter/openai/gpt-oss-120b", }, # DeepSeek models "deepseek-chat": { "provider": "deepseek", "api_key_var": "DEEPSEEK_API_KEY", - "base_url_var": "DEEPSEEK_BASE_URL", - "actual_model_name": "deepseek-chat", + "litellm_input_model_name": "deepseek/deepseek-chat", }, "deepseek-reasoner": { "provider": "deepseek", "api_key_var": "DEEPSEEK_API_KEY", - "base_url_var": "DEEPSEEK_BASE_URL", - "actual_model_name": "deepseek-reasoner", + "litellm_input_model_name": "deepseek/deepseek-reasoner", }, # Anthropic models - "claude-3-7-sonnet": { + "claude-3.7-sonnet": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", - "base_url_var": "ANTHROPIC_BASE_URL", - "actual_model_name": "claude-3-7-sonnet-20250219", + "litellm_input_model_name": "anthropic/claude-3-7-sonnet-20250219", }, - "claude-4-sonnet": { + "claude-sonnet-4": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", - "base_url_var": "ANTHROPIC_BASE_URL", - "actual_model_name": "claude-sonnet-4-20250514", + "litellm_input_model_name": "anthropic/claude-sonnet-4-20250514", }, - "claude-4-opus": { + "claude-opus-4": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", - "base_url_var": "ANTHROPIC_BASE_URL", - "actual_model_name": "claude-opus-4-20250514", + "litellm_input_model_name": "anthropic/claude-opus-4-20250514", }, - "claude-4.1-opus": { + "claude-opus-4.1": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", - "base_url_var": "ANTHROPIC_BASE_URL", - "actual_model_name": "claude-opus-4-1-20250805", + "litellm_input_model_name": "anthropic/claude-opus-4-1-20250805", }, # Google models "gemini-2.5-pro": { "provider": "google", "api_key_var": "GEMINI_API_KEY", - "base_url_var": "GEMINI_BASE_URL", - "actual_model_name": "gemini-2.5-pro", + "litellm_input_model_name": 
"gemini/gemini-2.5-pro", }, "gemini-2.5-flash": { "provider": "google", "api_key_var": "GEMINI_API_KEY", - "base_url_var": "GEMINI_BASE_URL", - "actual_model_name": "gemini-2.5-flash", + "litellm_input_model_name": "gemini/gemini-2.5-flash", }, # Moonshot models "k2": { "provider": "moonshot", "api_key_var": "MOONSHOT_API_KEY", - "base_url_var": "MOONSHOT_BASE_URL", - "actual_model_name": "kimi-k2-0711-preview", + "litellm_input_model_name": "moonshot/kimi-k2-0711-preview", + }, + "k2-turbo": { + "provider": "moonshot", + "api_key_var": "MOONSHOT_API_KEY", + "litellm_input_model_name": "moonshot/kimi-k2-turbo-preview", }, # Grok models "grok-4": { "provider": "xai", "api_key_var": "GROK_API_KEY", - "base_url_var": "GROK_BASE_URL", - "actual_model_name": "grok-4-0709", + "litellm_input_model_name": "xai/grok-4-0709", + }, + "grok-code-fast-1": { + "provider": "xai", + "api_key_var": "GROK_API_KEY", + "litellm_input_model_name": "xai/grok-code-fast-1", }, # Qwen models "qwen-3-coder": { "provider": "qwen", - "api_key_var": "QWEN_API_KEY", - "base_url_var": "QWEN_BASE_URL", - "actual_model_name": "qwen/qwen3-coder", + "api_key_var": "OPENROUTER_API_KEY", + "litellm_input_model_name": "openrouter/qwen/qwen3-coder", }, + "qwen-3-coder-plus": { + "provider": "qwen", + "api_key_var": "DASHSCOPE_API_KEY", + "litellm_input_model_name": "dashscope/qwen3-coder-plus", + }, + # Zhipu + "glm-4.5": { + "provider": "zhipu", + "api_key_var": "OPENROUTER_API_KEY", + "litellm_input_model_name": "openrouter/z-ai/glm-4.5", + } } def __init__(self, model_name: str): @@ -157,25 +164,22 @@ def __init__(self, model_name: str): Raises: ValueError: If the model is not supported or environment variables are missing. """ - self.model_name = model_name + self.short_model_name = model_name model_info = self._get_model_info(model_name) - # Load API key and base URL from environment variables + # Load API key, base URL and LiteLLM model name from environment variables + if "base_url_var" in model_info: + self.base_url = os.getenv(model_info["base_url_var"]) + else: + self.base_url = None + self.api_key = os.getenv(model_info["api_key_var"]) if not self.api_key: raise ValueError( f"Missing required environment variable: {model_info['api_key_var']}" ) - self.base_url = os.getenv(model_info["base_url_var"]) - if not self.base_url: - raise ValueError( - f"Missing required environment variable: {model_info['base_url_var']}" - ) - - # Store provider and the actual model name for the API - self.provider = model_info["provider"] - self.actual_model_name = model_info.get("actual_model_name", model_name) + self.litellm_input_model_name = model_info.get("litellm_input_model_name", model_name) def _get_model_info(self, model_name: str) -> Dict[str, str]: """ @@ -190,8 +194,7 @@ def _get_model_info(self, model_name: str) -> Dict[str, str]: return { "provider": "openai", "api_key_var": "OPENAI_API_KEY", - "base_url_var": "OPENAI_BASE_URL", - "actual_model_name": model_name, + "litellm_input_model_name": model_name, } return self.MODEL_CONFIGS[model_name] @@ -209,10 +212,8 @@ def main(): # Example: Create a model config for DeepSeek model_config = ModelConfig("deepseek-chat") logger.info("✅ DeepSeek model config created successfully.") - logger.info("Provider: %s", model_config.provider) - logger.info("Actual model name: %s", model_config.actual_model_name) + logger.info("Short model name: %s", model_config.short_model_name) logger.info("API key loaded: %s", bool(model_config.api_key)) - logger.info("Base URL: %s", 
model_config.base_url) except ValueError as e: logger.error("⚠️ Configuration error: %s", e) diff --git a/src/results_reporter.py b/src/results_reporter.py index 40ec5da..6c54136 100644 --- a/src/results_reporter.py +++ b/src/results_reporter.py @@ -102,6 +102,15 @@ def total_tokens(self) -> int: if result.token_usage: total += result.token_usage.get("total_tokens", 0) return total + + @property + def total_reasoning_tokens(self) -> int: + """Calculate total reasoning tokens across all tasks.""" + total = 0 + for result in self.task_results: + if result.token_usage: + total += result.token_usage.get("reasoning_tokens", 0) + return total @property def avg_input_tokens(self) -> float: @@ -123,6 +132,13 @@ def avg_total_tokens(self) -> float: if self.total_tasks == 0: return 0.0 return self.total_tokens / self.total_tasks + + @property + def avg_reasoning_tokens(self) -> float: + """Calculate average reasoning tokens per task.""" + if self.total_tasks == 0: + return 0.0 + return self.total_reasoning_tokens / self.total_tasks @property def total_task_execution_time(self) -> float: @@ -155,9 +171,11 @@ def get_category_stats(self) -> Dict[str, Dict[str, Any]]: "total_input_tokens": 0, "total_output_tokens": 0, "total_tokens": 0, + "total_reasoning_tokens": 0, "avg_input_tokens": 0.0, "avg_output_tokens": 0.0, "avg_total_tokens": 0.0, + "avg_reasoning_tokens": 0.0, "total_turns": 0, "avg_turns": 0.0, } @@ -179,6 +197,9 @@ def get_category_stats(self) -> Dict[str, Dict[str, Any]]: category_stats[category]["total_tokens"] += result.token_usage.get( "total_tokens", 0 ) + category_stats[category]["total_reasoning_tokens"] += result.token_usage.get( + "reasoning_tokens", 0 + ) # Accumulate turns if result.turn_count is not None: @@ -206,6 +227,7 @@ def get_category_stats(self) -> Dict[str, Dict[str, Any]]: stats["total_output_tokens"] / stats["total"] ) stats["avg_total_tokens"] = stats["total_tokens"] / stats["total"] + stats["avg_reasoning_tokens"] = stats["total_reasoning_tokens"] / stats["total"] stats["avg_turns"] = ( stats["total_turns"] / stats["total"] if stats["total"] > 0 else 0 @@ -241,7 +263,9 @@ def save_meta_json( meta_data = { "task_name": task_result.task_name, - "model": model_config.get("model_name", "unknown"), + "model_name": model_config.get("model_name", "unknown"), + "litellm_run_model_name": model_config.get("litellm_run_model_name"), + "reasoning_effort": model_config.get("reasoning_effort"), "mcp": model_config.get("mcp_service", "unknown"), "timeout": model_config.get("timeout", 300), "time": {"start": start_time.isoformat(), "end": end_time.isoformat()}, @@ -272,7 +296,8 @@ def save_model_summary(self, report: EvaluationReport, output_path: Path) -> Pat avg_turns = total_turns / report.total_tasks if report.total_tasks > 0 else 0 summary = { - "model": report.model_name, + "model_name": report.model_name, + "model_config": report.model_config, "total_tasks": report.total_tasks, "successful_tasks": report.successful_tasks, "failed_tasks": report.failed_tasks, @@ -289,9 +314,11 @@ def save_model_summary(self, report: EvaluationReport, output_path: Path) -> Pat "total_input_tokens": report.total_input_tokens, "total_output_tokens": report.total_output_tokens, "total_tokens": report.total_tokens, + "total_reasoning_tokens": report.total_reasoning_tokens, "avg_input_tokens": round(report.avg_input_tokens, 2), "avg_output_tokens": round(report.avg_output_tokens, 2), "avg_total_tokens": round(report.avg_total_tokens, 2), + "avg_reasoning_tokens": 
round(report.avg_reasoning_tokens, 2), }, "turn_usage": { "total_turns": total_turns, @@ -306,9 +333,11 @@ def save_model_summary(self, report: EvaluationReport, output_path: Path) -> Pat "total_input": stats["total_input_tokens"], "total_output": stats["total_output_tokens"], "total": stats["total_tokens"], + "total_reasoning": stats["total_reasoning_tokens"], "avg_input": round(stats["avg_input_tokens"], 2), "avg_output": round(stats["avg_output_tokens"], 2), "avg_total": round(stats["avg_total_tokens"], 2), + "avg_reasoning": round(stats["avg_reasoning_tokens"], 2), }, "turn_usage": { "total_turns": stats["total_turns"], diff --git a/tasks/notion/it_trouble_shooting_hub/verification_expired_update/meta.json b/tasks/notion/it_trouble_shooting_hub/verification_expired_update/meta.json index c09a53d..a4602df 100644 --- a/tasks/notion/it_trouble_shooting_hub/verification_expired_update/meta.json +++ b/tasks/notion/it_trouble_shooting_hub/verification_expired_update/meta.json @@ -21,6 +21,6 @@ "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5", - "stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101" + "stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub" } } \ No newline at end of file
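A closing note on the `src/errors.py` change earlier in this patch: retry eligibility is now a case-insensitive substring match against a short, generic pattern list, so the two pipeline signals the evaluator previously hard-coded in `PIPELINE_RETRY_ERRORS` still trigger a retry, common API failures are caught as well, and task-level messages such as "already exists" no longer are. A small sketch of the resulting behavior:

```python
from src.errors import is_retryable_error

# Pipeline signals formerly hard-coded in PIPELINE_RETRY_ERRORS still match.
assert is_retryable_error("State Duplication Error")
assert is_retryable_error("MCP Network Error")

# Generic API/infrastructure failures now match too.
assert is_retryable_error("RateLimitError: too many requests")
assert is_retryable_error("HTTP 503 Service Unavailable")

# Task-level outcomes are no longer retried, and empty/None errors are handled safely.
assert not is_retryable_error("page already exists")
assert not is_retryable_error(None)
```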