Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,8 +540,8 @@ async def execute(self, instruction: str) -> Dict[str, Any]:
if is_retryable_error(result["error"]) and attempt < self.max_retries:
wait_seconds = get_retry_delay(attempt)
logger.warning(
f"[Retry] Attempt {attempt}/{self.max_retries} failed. "
f"Waiting {wait_seconds}s before retrying: {error_msg}"
f"| [Retry] Attempt {attempt}/{self.max_retries} failed. "
f"| Waiting {wait_seconds}s before retrying: {error_msg}"
)
await asyncio.sleep(wait_seconds)
continue # Retry
Expand Down
2 changes: 1 addition & 1 deletion src/base/state_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def clean_up(self, task: BaseTask = None) -> bool:
logger.info(f"| ✓ Cleanup completed for {self.service_name}")
else:
logger.warning(
f"⚠️ Cleanup completed with some failures for {self.service_name}"
f"| Cleanup completed with some failures for {self.service_name}"
)

return cleanup_success
Expand Down
18 changes: 18 additions & 0 deletions src/mcp_services/playwright/playwright_state_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,24 @@ def close_all(self) -> None:
except Exception as e:
logger.error(f"Error closing browser resources: {e}")

def set_verification_environment(self, messages_path: str = None) -> None:
    """
    Export Playwright-specific environment variables for verification scripts.

    Nothing is changed when ``messages_path`` is falsy. Otherwise
    ``MCP_MESSAGES`` is set to the given path and ``PLAYWRIGHT_WORK_DIR``
    to the directory that contains it.

    Args:
        messages_path: Optional path to messages.json file for verification
    """
    import os

    # Guard clause: without a messages file there is nothing to export.
    if not messages_path:
        return

    os.environ["MCP_MESSAGES"] = str(messages_path)

    # The work dir is simply the parent directory of messages.json.
    parent_dir = str(Path(messages_path).parent)
    os.environ["PLAYWRIGHT_WORK_DIR"] = parent_dir

    logger.info(f"| Set PLAYWRIGHT_WORK_DIR to: {parent_dir}")
    logger.info(f"| Set MCP_MESSAGES to: {messages_path}")

def __del__(self) -> None:
    """Finalizer: release resources by delegating to ``close_all()``."""
    self.close_all()
58 changes: 45 additions & 13 deletions src/mcp_services/playwright/playwright_task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,24 @@
"""

import sys
import os
import subprocess
from pathlib import Path
from typing import List, Dict, Any

from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger

logger = get_logger(__name__)


class PlaywrightTask(BaseTask):
    """Task for the Playwright service whose name is taken from its directory name."""

    @property
    def name(self) -> str:
        """Build the task name as ``<category>/<task_id>``.

        The task id is used verbatim; no ``task_`` prefix is inserted.
        """
        category, task_id = self.category, self.task_id
        return f"{category}/{task_id}"


class PlaywrightTaskManager(BaseTaskManager):
Expand All @@ -24,24 +38,18 @@ def __init__(self, tasks_root: Path = None):
super().__init__(
tasks_root,
mcp_service="playwright",
task_class=BaseTask,
task_class=PlaywrightTask,
task_organization="directory",
)

def _create_task_from_files(
self, category_name: str, task_files_info: Dict[str, Any]
) -> BaseTask:
"""Instantiate a `BaseTask` from the dictionary returned by `_find_task_files`."""
# Extract numeric ID from folder name like "task_1" so that the default
# `BaseTask.name` ("{category}/task_{task_id}") matches the original path
# pattern used by the CLI filter, e.g. "form_interaction/task_1".
try:
task_id = int(task_files_info["task_name"].split("_")[1])
except (IndexError, ValueError):
# Fallback to entire slug when it is not in the expected format
task_id = task_files_info["task_name"]

return BaseTask(
) -> PlaywrightTask:
"""Instantiate a `PlaywrightTask` from the dictionary returned by `_find_task_files`."""
# Use the directory name directly as task_id for cleaner task names
task_id = task_files_info["task_name"]

return PlaywrightTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="playwright",
Expand All @@ -53,6 +61,30 @@ def _get_verification_command(self, task: BaseTask) -> List[str]:
"""Get verification command - just run the verify.py script."""
return [sys.executable, str(task.task_verification_path)]

def run_verification(
    self, task: BaseTask, *, timeout: float = 90
) -> subprocess.CompletedProcess:
    """Run the task's verification command with the Playwright environment.

    The child process receives a copy of the current process environment,
    which already carries ``MCP_MESSAGES`` and ``PLAYWRIGHT_WORK_DIR`` when
    they are set, so no explicit re-assignment is needed; the values are
    only logged for debugging.

    Args:
        task: Task whose verification command is executed.
        timeout: Seconds to wait for the verification process before
            giving up. Defaults to 90.

    Returns:
        The completed process, with stdout/stderr captured as text.

    Raises:
        subprocess.TimeoutExpired: If the command runs longer than ``timeout``.
    """
    env = os.environ.copy()

    # Log the variables the verification script relies on, when present.
    for var in ("MCP_MESSAGES", "PLAYWRIGHT_WORK_DIR"):
        value = env.get(var)
        if value:
            logger.debug(f"Setting {var} to: {value}")

    return subprocess.run(
        self._get_verification_command(task),
        capture_output=True,
        text=True,
        timeout=timeout,
        env=env,
    )

def _format_task_instruction(self, base_instruction: str) -> str:
"""Add Playwright-specific note to instructions."""
return (
Expand Down
3 changes: 2 additions & 1 deletion src/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@
},
"password": {
"env_var": "POSTGRES_PASSWORD",
"required": True,
"default": "123456",
"required": False,
"description": "PostgreSQL password",
},
},
Expand Down
98 changes: 98 additions & 0 deletions tasks/playwright/eval_web/extraction_table/data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
Title, Rating, Likes, Views, Replies
React 18 New Features Deep Dive, "4.8", 856, 12543, 89
Vue 3 Composition API in Practice, "4.5", 743, 9876, 67
Advanced TypeScript Types Guide, "4.9", 924, 15432, 102
Node.js Performance Optimization, "4.2", 567, 8765, 45
Frontend Engineering Best Practices, "4.7", 812, 11234, 78
Microservices Architecture Patterns, "4.3", 634, 9543, 56
Docker Containerization Deployment, "4.6", 789, 10876, 71
Kubernetes Cluster Management, "4.4", 698, 9234, 63
GraphQL API Design Principles, "4.8", 876, 13456, 94
Webpack 5 Configuration Guide, "4.1", 523, 7654, 38
Vite Build Tool Usage, "4.5", 745, 10123, 69
ESLint Code Standards, "4.7", 823, 11567, 82
Unit Testing Best Practices, "4.3", 612, 8934, 51
Performance Monitoring & Optimization, "4.9", 945, 16234, 108
Security Protection Strategies, "4.2", 578, 8456, 47
Database Design Principles, "4.6", 767, 10567, 73
Caching Strategies Implementation, "4.4", 689, 9123, 61
Message Queue Applications, "4.8", 834, 12876, 87
Distributed Systems Design, "4.0", 456, 6789, 34
Cloud Native Development, "4.5", 723, 9876, 65
DevOps Process Optimization, "4.7", 801, 11234, 79
Machine Learning Introduction, "4.1", 534, 7543, 41
Artificial Intelligence Applications, "4.6", 778, 10456, 74
Blockchain Technology Fundamentals, "4.3", 645, 8765, 53
Mobile Development Techniques, "4.9", 912, 14567, 97
Cross-Platform Solutions, "4.2", 589, 8234, 48
Progressive Web App Development, "4.8", 867, 12345, 91
Web3 Development Guide, "4.4", 712, 9567, 64
NFT Smart Contracts, "4.5", 756, 10234, 70
DeFi Protocol Design, "4.7", 834, 11876, 83
Game Engine Development, "4.3", 623, 8567, 52
3D Graphics Rendering, "4.6", 789, 10678, 75
Audio Video Processing, "4.1", 545, 7234, 42
IoT Applications, "4.8", 856, 12567, 88
Edge Computing Practices, "4.2", 567, 8345, 46
5G Network Technology, "4.9", 923, 15123, 103
Quantum Computing Principles, "4.4", 678, 9345, 62
Bioinformatics Analysis, "4.5", 734, 9876, 68
Data Science Methods, "4.7", 812, 11456, 80
Algorithms and Data Structures, "4.3", 634, 8678, 54
System Design Interview, "4.6", 778, 10345, 76
Code Refactoring Techniques, "4.8", 845, 12234, 89
Open Source Contributions, "4.2", 556, 7890, 43
Technical Team Management, "4.5", 723, 9567, 66
Product Thinking Development, "4.9", 901, 14234, 95
User Experience Design, "4.1", 512, 7123, 39
Interface Interaction Optimization, "4.7", 789, 10890, 77
Accessibility Design, "4.4", 667, 8901, 58
SEO Optimization Strategies, "4.6", 756, 10123, 72
Social Media Operations, "4.3", 623, 8456, 55
Serverless Architecture, "4.7", 834, 11234, 81
API Gateway Design, "4.2", 567, 8765, 49
Microservice Communication, "4.8", 892, 13567, 95
Event-Driven Architecture, "4.5", 723, 9876, 67
CQRS Pattern Implementation, "4.3", 645, 8234, 54
Domain-Driven Design, "4.6", 778, 10456, 73
Clean Architecture Principles, "4.4", 689, 9123, 62
Hexagonal Architecture, "4.1", 534, 7543, 42
Onion Architecture, "4.5", 712, 9567, 65
Event Sourcing Patterns, "4.7", 823, 11876, 79
Saga Pattern for Distributed Systems, "4.3", 612, 8934, 53
Circuit Breaker Pattern, "4.8", 856, 12543, 87
Bulkhead Pattern, "4.2", 578, 8456, 47
Retry Pattern Implementation, "4.6", 767, 10567, 74
Timeout Pattern, "4.4", 698, 9234, 63
Rate Limiting Strategies, "4.9", 934, 15432, 103
Load Balancing Techniques, "4.1", 523, 7654, 39
Service Mesh Architecture, "4.5", 745, 10123, 69
Istio Service Mesh, "4.7", 812, 11567, 82
Envoy Proxy Configuration, "4.3", 634, 9543, 56
Consul Service Discovery, "4.6", 789, 10876, 71
Kubernetes Ingress, "4.4", 676, 9345, 58
Helm Chart Development, "4.8", 845, 12234, 89
Terraform Infrastructure, "4.2", 556, 7890, 44
Ansible Automation, "4.5", 723, 9567, 66
Jenkins Pipeline, "4.7", 801, 11234, 78
GitLab CI/CD, "4.3", 623, 8567, 52
GitHub Actions, "4.6", 789, 10678, 75
Azure DevOps, "4.1", 512, 7123, 41
AWS CodePipeline, "4.8", 867, 12345, 91
Docker Compose, "4.4", 712, 9567, 64
Kubernetes Operators, "4.5", 756, 10234, 70
Custom Resource Definitions, "4.7", 834, 11876, 83
Pod Security Policies, "4.3", 623, 8567, 52
Network Policies, "4.6", 789, 10678, 75
RBAC Configuration, "4.1", 545, 7234, 42
Secret Management, "4.8", 856, 12567, 88
ConfigMap Usage, "4.2", 567, 8345, 46
Persistent Volumes, "4.9", 923, 15123, 103
StatefulSets, "4.4", 678, 9345, 62
DaemonSets, "4.5", 734, 9876, 68
Jobs and CronJobs, "4.7", 812, 11456, 80
Horizontal Pod Autoscaler, "4.3", 634, 8678, 54
Vertical Pod Autoscaler, "4.6", 778, 10345, 76
Cluster Autoscaler, "4.8", 845, 12234, 89
Resource Quotas, "4.2", 556, 7890, 43
Limit Ranges, "4.5", 723, 9567, 66
37 changes: 37 additions & 0 deletions tasks/playwright/eval_web/extraction_table/description.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Web Data Extraction Task

Use Playwright MCP tools to extract all data from the specified website and present it in CSV format.

## Requirements:

1. Navigate to https://eval-web.mcpmark.ai/extraction
2. Wait for the page to fully load
3. Extract all data content from the page, including:
   - Title
   - Rating
   - Likes
   - Views
   - Replies
4. Organize the extracted data into CSV format
5. Ensure data completeness and accuracy
6. Output ONLY the complete CSV formatted data (no additional text or explanations)

## CSV Data Example:

```csv
Title, Rating, Likes, Views, Replies
SEO Optimization, "4.6", 756, 10123, 72
Vue 3 Composition API, "4.5", 743, 9876, 67
Advanced TypeScript Types Guide, "4.9", 924, 15432, 102
Node.js Performance Optimization, "4.2", 567, 8765, 45
Frontend Engineering Best Practices, "4.7", 812, 11234, 78
```

## Notes:

- Ensure extraction of all visible data rows
- Maintain data format consistency
- Integer fields (Likes, Views, Replies) should NOT have quotes; Rating is wrapped in double quotes exactly as shown in the example above (e.g. `"4.6"`); text data should be wrapped in quotes only when it contains commas
- Wait for the page to fully load before starting data extraction
- Verify the quantity and format of extracted data are correct
- **IMPORTANT: Final output must contain ONLY CSV data - no explanatory text, descriptions, or other content**
Loading