eval-sys · arvinxx · Aug 18, 2025 · Aug 18, 2025 · Aug 18, 2025 · Aug 18, 2025
diff --git a/src/agent.py b/src/agent.py
@@ -540,8 +540,8 @@ async def execute(self, instruction: str) -> Dict[str, Any]:
             if is_retryable_error(result["error"]) and attempt < self.max_retries:
                 wait_seconds = get_retry_delay(attempt)
                 logger.warning(
-                    f"[Retry] Attempt {attempt}/{self.max_retries} failed. "
-                    f"Waiting {wait_seconds}s before retrying: {error_msg}"
+                    f"| [Retry] Attempt {attempt}/{self.max_retries} failed. "
+                    f"| Waiting {wait_seconds}s before retrying: {error_msg}"
                 )
                 await asyncio.sleep(wait_seconds)
                 continue  # Retry

diff --git a/src/base/state_manager.py b/src/base/state_manager.py
@@ -92,7 +92,7 @@ def clean_up(self, task: BaseTask = None) -> bool:
                 logger.info(f"| ✓ Cleanup completed for {self.service_name}")
             else:
                 logger.warning(
-                    f"⚠️ Cleanup completed with some failures for {self.service_name}"
+                    f"| Cleanup completed with some failures for {self.service_name}"
                 )
 
             return cleanup_success

diff --git a/src/mcp_services/playwright/playwright_state_manager.py b/src/mcp_services/playwright/playwright_state_manager.py
@@ -332,6 +332,24 @@ def close_all(self) -> None:
         except Exception as e:
             logger.error(f"Error closing browser resources: {e}")
 
+    def set_verification_environment(self, messages_path: str = None) -> None:
+        """
+        Set Playwright-specific environment variables for verification scripts.
+
+        The variables are written to this process's ``os.environ``. When
+        ``messages_path`` is ``None`` or empty, nothing is changed.
+
+        Args:
+            messages_path: Optional path to messages.json file for verification.
+                When given, ``MCP_MESSAGES`` is set to this path and
+                ``PLAYWRIGHT_WORK_DIR`` to its parent directory.
+        """
+        import os
+
+        # Set common MCP_MESSAGES if provided
+        if messages_path:
+            os.environ["MCP_MESSAGES"] = str(messages_path)
+            # Also set PLAYWRIGHT_WORK_DIR to the directory containing messages.json
+            work_dir = str(Path(messages_path).parent)
+            os.environ["PLAYWRIGHT_WORK_DIR"] = work_dir
+            logger.info(f"| Set PLAYWRIGHT_WORK_DIR to: {work_dir}")
+            logger.info(f"| Set MCP_MESSAGES to: {messages_path}")
+
     def __del__(self):
         """Ensure cleanup on deletion."""
         self.close_all()
diff --git a/src/mcp_services/playwright/playwright_task_manager.py b/src/mcp_services/playwright/playwright_task_manager.py
@@ -7,10 +7,24 @@
 """
 
 import sys
+import os
+import subprocess
 from pathlib import Path
 from typing import List, Dict, Any
 
 from src.base.task_manager import BaseTask, BaseTaskManager
+from src.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class PlaywrightTask(BaseTask):
+    """Playwright-specific task that uses directory name as task name.
+
+    Overrides ``name`` so the identifier is built from ``category`` and
+    ``task_id`` exactly as stored, with no extra prefix inserted.
+    """
+
+    @property
+    def name(self) -> str:
+        """Return the task name in the format 'category/task_id' without forcing 'task_' prefix."""
+        # task_id is interpolated verbatim, whatever value the task was created with.
+        return f"{self.category}/{self.task_id}"
 
 
 class PlaywrightTaskManager(BaseTaskManager):
@@ -24,24 +38,18 @@ def __init__(self, tasks_root: Path = None):
         super().__init__(
             tasks_root,
             mcp_service="playwright",
-            task_class=BaseTask,
+            task_class=PlaywrightTask,
             task_organization="directory",
         )
 
     def _create_task_from_files(
         self, category_name: str, task_files_info: Dict[str, Any]
-    ) -> BaseTask:
-        """Instantiate a `BaseTask` from the dictionary returned by `_find_task_files`."""
-        # Extract numeric ID from folder name like "task_1" so that the default
-        # `BaseTask.name` ("{category}/task_{task_id}") matches the original path
-        # pattern used by the CLI filter, e.g. "form_interaction/task_1".
-        try:
-            task_id = int(task_files_info["task_name"].split("_")[1])
-        except (IndexError, ValueError):
-            # Fallback to entire slug when it is not in the expected format
-            task_id = task_files_info["task_name"]
-
-        return BaseTask(
+    ) -> PlaywrightTask:
+        """Instantiate a `PlaywrightTask` from the dictionary returned by `_find_task_files`."""
+        # Use the directory name directly as task_id for cleaner task names
+        task_id = task_files_info["task_name"]
+
+        return PlaywrightTask(
             task_instruction_path=task_files_info["instruction_path"],
             task_verification_path=task_files_info["verification_path"],
             service="playwright",
@@ -53,6 +61,30 @@ def _get_verification_command(self, task: BaseTask) -> List[str]:
         """Get verification command - just run the verify.py script."""
         return [sys.executable, str(task.task_verification_path)]
 
+    def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
+        """Run verification with Playwright-specific environment.
+
+        Args:
+            task: Task whose verification command (see
+                ``_get_verification_command``) is executed.
+
+        Returns:
+            The ``subprocess.CompletedProcess`` of the verification run, with
+            stdout and stderr captured as text.
+
+        Raises:
+            subprocess.TimeoutExpired: If the command runs longer than 90 seconds.
+        """
+        # Start from a copy so the child's environment can be adjusted without
+        # touching this process's os.environ.
+        env = os.environ.copy()
+
+        # Pass messages.json path and working directory to verification script
+        messages_path = os.getenv("MCP_MESSAGES")
+        work_dir = os.getenv("PLAYWRIGHT_WORK_DIR")
+
+        # NOTE: `env` was copied from os.environ above, so any value read here
+        # is already present in it; the assignments below make the hand-off
+        # explicit and drive the debug logging.
+        if messages_path:
+            env["MCP_MESSAGES"] = messages_path
+            logger.debug(f"Setting MCP_MESSAGES to: {messages_path}")
+
+        if work_dir:
+            env["PLAYWRIGHT_WORK_DIR"] = work_dir
+            logger.debug(f"Setting PLAYWRIGHT_WORK_DIR to: {work_dir}")
+
+        return subprocess.run(
+            self._get_verification_command(task),
+            capture_output=True,
+            text=True,
+            timeout=90,
+            env=env,
+        )
+
     def _format_task_instruction(self, base_instruction: str) -> str:
         """Add Playwright-specific note to instructions."""
         return (

diff --git a/src/services.py b/src/services.py
@@ -233,7 +233,8 @@
             },
             "password": {
                 "env_var": "POSTGRES_PASSWORD",
-                "required": True,
+                "default": "123456",
+                "required": False,
                 "description": "PostgreSQL password",
             },
         },

diff --git a/...turnstile_challenge/task_1/description.md → ...dflare_turnstile_challenge/description.md b/...turnstile_challenge/task_1/description.md → ...dflare_turnstile_challenge/description.md
diff --git a/...lare_turnstile_challenge/task_1/verify.py → .../cloudflare_turnstile_challenge/verify.py b/...lare_turnstile_challenge/task_1/verify.py → .../cloudflare_turnstile_challenge/verify.py
diff --git a/tasks/playwright/eval_web/extraction_table/data.csv b/tasks/playwright/eval_web/extraction_table/data.csv
@@ -0,0 +1,98 @@
+Title, Rating, Likes, Views, Replies
+React 18 New Features Deep Dive, "4.8", 856, 12543, 89
+Vue 3 Composition API in Practice, "4.5", 743, 9876, 67
+Advanced TypeScript Types Guide, "4.9", 924, 15432, 102
+Node.js Performance Optimization, "4.2", 567, 8765, 45
+Frontend Engineering Best Practices, "4.7", 812, 11234, 78
+Microservices Architecture Patterns, "4.3", 634, 9543, 56
+Docker Containerization Deployment, "4.6", 789, 10876, 71
+Kubernetes Cluster Management, "4.4", 698, 9234, 63
+GraphQL API Design Principles, "4.8", 876, 13456, 94
+Webpack 5 Configuration Guide, "4.1", 523, 7654, 38
+Vite Build Tool Usage, "4.5", 745, 10123, 69
+ESLint Code Standards, "4.7", 823, 11567, 82
+Unit Testing Best Practices, "4.3", 612, 8934, 51
+Performance Monitoring & Optimization, "4.9", 945, 16234, 108
+Security Protection Strategies, "4.2", 578, 8456, 47
+Database Design Principles, "4.6", 767, 10567, 73
+Caching Strategies Implementation, "4.4", 689, 9123, 61
+Message Queue Applications, "4.8", 834, 12876, 87
+Distributed Systems Design, "4.0", 456, 6789, 34
+Cloud Native Development, "4.5", 723, 9876, 65
+DevOps Process Optimization, "4.7", 801, 11234, 79
+Machine Learning Introduction, "4.1", 534, 7543, 41
+Artificial Intelligence Applications, "4.6", 778, 10456, 74
+Blockchain Technology Fundamentals, "4.3", 645, 8765, 53
+Mobile Development Techniques, "4.9", 912, 14567, 97
+Cross-Platform Solutions, "4.2", 589, 8234, 48
+Progressive Web App Development, "4.8", 867, 12345, 91
+Web3 Development Guide, "4.4", 712, 9567, 64
+NFT Smart Contracts, "4.5", 756, 10234, 70
+DeFi Protocol Design, "4.7", 834, 11876, 83
+Game Engine Development, "4.3", 623, 8567, 52
+3D Graphics Rendering, "4.6", 789, 10678, 75
+Audio Video Processing, "4.1", 545, 7234, 42
+IoT Applications, "4.8", 856, 12567, 88
+Edge Computing Practices, "4.2", 567, 8345, 46
+5G Network Technology, "4.9", 923, 15123, 103
+Quantum Computing Principles, "4.4", 678, 9345, 62
+Bioinformatics Analysis, "4.5", 734, 9876, 68
+Data Science Methods, "4.7", 812, 11456, 80
+Algorithms and Data Structures, "4.3", 634, 8678, 54
+System Design Interview, "4.6", 778, 10345, 76
+Code Refactoring Techniques, "4.8", 845, 12234, 89
+Open Source Contributions, "4.2", 556, 7890, 43
+Technical Team Management, "4.5", 723, 9567, 66
+Product Thinking Development, "4.9", 901, 14234, 95
+User Experience Design, "4.1", 512, 7123, 39
+Interface Interaction Optimization, "4.7", 789, 10890, 77
+Accessibility Design, "4.4", 667, 8901, 58
+SEO Optimization Strategies, "4.6", 756, 10123, 72
+Social Media Operations, "4.3", 623, 8456, 55
+Serverless Architecture, "4.7", 834, 11234, 81
+API Gateway Design, "4.2", 567, 8765, 49
+Microservice Communication, "4.8", 892, 13567, 95
+Event-Driven Architecture, "4.5", 723, 9876, 67
+CQRS Pattern Implementation, "4.3", 645, 8234, 54
+Domain-Driven Design, "4.6", 778, 10456, 73
+Clean Architecture Principles, "4.4", 689, 9123, 62
+Hexagonal Architecture, "4.1", 534, 7543, 42
+Onion Architecture, "4.5", 712, 9567, 65
+Event Sourcing Patterns, "4.7", 823, 11876, 79
+Saga Pattern for Distributed Systems, "4.3", 612, 8934, 53
+Circuit Breaker Pattern, "4.8", 856, 12543, 87
+Bulkhead Pattern, "4.2", 578, 8456, 47
+Retry Pattern Implementation, "4.6", 767, 10567, 74
+Timeout Pattern, "4.4", 698, 9234, 63
+Rate Limiting Strategies, "4.9", 934, 15432, 103
+Load Balancing Techniques, "4.1", 523, 7654, 39
+Service Mesh Architecture, "4.5", 745, 10123, 69
+Istio Service Mesh, "4.7", 812, 11567, 82
+Envoy Proxy Configuration, "4.3", 634, 9543, 56
+Consul Service Discovery, "4.6", 789, 10876, 71
+Kubernetes Ingress, "4.4", 676, 9345, 58
+Helm Chart Development, "4.8", 845, 12234, 89
+Terraform Infrastructure, "4.2", 556, 7890, 44
+Ansible Automation, "4.5", 723, 9567, 66
+Jenkins Pipeline, "4.7", 801, 11234, 78
+GitLab CI/CD, "4.3", 623, 8567, 52
+GitHub Actions, "4.6", 789, 10678, 75
+Azure DevOps, "4.1", 512, 7123, 41
+AWS CodePipeline, "4.8", 867, 12345, 91
+Docker Compose, "4.4", 712, 9567, 64
+Kubernetes Operators, "4.5", 756, 10234, 70
+Custom Resource Definitions, "4.7", 834, 11876, 83
+Pod Security Policies, "4.3", 623, 8567, 52
+Network Policies, "4.6", 789, 10678, 75
+RBAC Configuration, "4.1", 545, 7234, 42
+Secret Management, "4.8", 856, 12567, 88
+ConfigMap Usage, "4.2", 567, 8345, 46
+Persistent Volumes, "4.9", 923, 15123, 103
+StatefulSets, "4.4", 678, 9345, 62
+DaemonSets, "4.5", 734, 9876, 68
+Jobs and CronJobs, "4.7", 812, 11456, 80
+Horizontal Pod Autoscaler, "4.3", 634, 8678, 54
+Vertical Pod Autoscaler, "4.6", 778, 10345, 76
+Cluster Autoscaler, "4.8", 845, 12234, 89
+Resource Quotas, "4.2", 556, 7890, 43
+Limit Ranges, "4.5", 723, 9567, 66
diff --git a/tasks/playwright/eval_web/extraction_table/description.md b/tasks/playwright/eval_web/extraction_table/description.md
@@ -0,0 +1,37 @@
+# Web Data Extraction Task
+
+Use Playwright MCP tools to extract all data from the specified website and present it in CSV format.
+
+## Requirements:
+
+1. Navigate to https://eval-web.mcpmark.ai/extraction
+2. Wait for the page to fully load
+3. Extract all data content from the page, including:
+   - Title
+   - Rating
+   - Likes
+   - Views
+   - Replies
+4. Organize the extracted data into CSV format
+5. Ensure data completeness and accuracy
+6. Output ONLY the complete CSV formatted data (no additional text or explanations)
+
+## CSV Data Example:
+
+```csv
+Title, Rating, Likes, Views, Replies
+SEO Optimization, "4.6", 756, 10123, 72
+Vue 3 Composition API, "4.5", 743, 9876, 67
+Advanced TypeScript Types Guide, "4.9", 924, 15432, 102
+Node.js Performance Optimization, "4.2", 567, 8765, 45
+Frontend Engineering Best Practices, "4.7", 812, 11234, 78
+```
+
+## Notes:
+
+- Extract every visible data row; do not skip or truncate any rows
+- Maintain data format consistency
+- Follow the quoting shown in the CSV example above: Rating values are wrapped in double quotes (e.g. "4.6"), while Likes, Views, and Replies are plain unquoted integers; a Title needs quotes only if it contains a comma
+- Wait for the page to fully load before starting data extraction
+- Verify that the number of extracted rows and the format of the extracted data are correct
+- **IMPORTANT: Final output must contain ONLY CSV data - no explanatory text, descriptions, or other content**