|
3 | 3 | from platform-labs-agent-eval-harness. |
4 | 4 | """ |
5 | 5 |
|
6 | | -import asyncio |
7 | | -import importlib |
8 | 6 | import json |
9 | 7 | import logging |
10 | | -from datetime import datetime |
| 8 | +import tempfile |
11 | 9 | from pathlib import Path |
12 | | -from typing import Any, Dict, Optional |
13 | 10 |
|
14 | 11 | import playwright |
15 | | -from agent_eval_harness_common.models import AllocationResource, WebsiteRequirement |
16 | | -from playwright.async_api import async_playwright |
17 | 12 |
|
18 | 13 | from browsergym.webarena.instance import WebArenaInstance |
19 | 14 | from webarena_verified.api.evaluator_api import TaskEvaluator |
@@ -75,14 +70,27 @@ def __call__( |
75 | 70 | Returns: |
76 | 71 | Float score compatible with BrowserGym (1.0 or 0.0) |
77 | 72 | """ |
| 73 | + # import webarena dynamically |
| 74 | + from webarena.browser_env.actions import ActionTypes |
| 75 | + # if last action is not a STOP action, return 0.0 as the task is not completed yet |
| 76 | + if trajectory[-1].get("action_type") != ActionTypes.STOP: |
| 77 | + return 0.0 |
| 78 | + |
| 79 | + # task is done: load the config file, stop playwright tracing, and evaluate the trace |
78 | 80 | with open(config_file, "r") as f: |
79 | | - config = json.load(f) |
| 81 | + config_raw = json.load(f) |
| 82 | + config: WebArenaVerifiedTask = WebArenaVerifiedTask.model_validate(config_raw) |
| 83 | + |
| 84 | + # stop playwright tracing |
| 85 | + with tempfile.TemporaryDirectory() as temp_dir: |
| 86 | + trace_path = Path(temp_dir) / f"wav_{config.task_id}.zip" |
| 87 | + page.context.tracing.stop(path=trace_path) |
80 | 88 |
|
81 | 89 | # create eval request |
82 | 90 | eval_request = WebarenaTaskEvalRequest( |
83 | | - task=WebArenaVerifiedTask.model_validate(config), |
| 91 | + task=config, |
84 | 92 | agent_response_raw=trajectory[-1].get("answer"), |
85 | | - network_trace=NetworkTrace.from_playwright_trace(...), # TODO: add path to playwright trace should be Path(exp_args.exp_dir / "pw_traces" / f"{exp_args.exp_name}.zip") |
| 93 | + network_trace=NetworkTrace.from_playwright_trace(trace_path), # TODO: the playwright trace path should be Path(exp_args.exp_dir / "pw_traces" / f"{exp_args.exp_name}.zip") |
86 | 94 | ) |
87 | 95 |
|
88 | 96 | # Run wa_verified evaluation and return float score |
|
0 commit comments