Skip to content

Commit 9e779f0

Browse files
authored
Added TimeWarp: A cool new web agent benchmark on multiple temporal versions of web environments (#385)
* Added TimeWarp
* Clarify environment setup in README
  Updated README to clarify environment setup instructions.
* Shifted timewarp's code to source repo
* Moved timewarp from .toml to dev/requirements.txt
1 parent 594c9d4 commit 9e779f0

File tree

11 files changed

+296
-2
lines changed

11 files changed

+296
-2
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ BrowserGym includes the following benchmarks by default:
4545
- [AssistantBench](https://github.com/oriyor/assistantbench)
4646
- [WebLINX](https://github.com/McGill-NLP/weblinx) (static benchmark)
4747
- [OpenApps](https://facebookresearch.github.io/OpenApps/)
48+
- [TimeWarp](https://timewarp-web.github.io)
4849

4950
Designing new web benchmarks with BrowserGym is easy, and simply requires inheriting from the [`AbstractBrowserTask`](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/core/src/browsergym/core/task.py#L7C7-L7C26) class.
5051

@@ -62,6 +63,7 @@ pip install browsergym-visualwebarena # core + visualwebarena
6263
pip install browsergym-workarena # core + workarena
6364
pip install browsergym-assistantbench # core + assistantbench
6465
pip install weblinx-browsergym # core + weblinx
66+
pip install browsergym-timewarp # core + timewarp
6567
```
6668

6769
Then setup playwright by running
@@ -77,6 +79,7 @@ Finally, each benchmark comes with its own specific setup that requires to follo
7779
- for WorkArena, see [WorkArena](https://github.com/ServiceNow/WorkArena)
7880
- for AssistantBench, see [assistantbench/README.md](browsergym/assistantbench/README.md)
7981
- for OpenApps, see [OpenApps docs](https://facebookresearch.github.io/OpenApps/)
82+
- for TimeWarp, see [timewarp/README.md](https://github.com/sparklabutah/timewarp)
8083

8184
### 🏗️ Development setup
8285

@@ -194,6 +197,20 @@ launcher = OpenAppsLauncher(config)
194197
launcher.launch()
195198
```
196199

200+
TimeWarp
201+
```python
202+
import gymnasium as gym
203+
import browsergym.timewarp # register timewarp tasks as gym environments
204+
205+
# start a timewarp task
206+
env = gym.make("browsergym/timewarp.1")
207+
...
208+
209+
# list all the available timewarp tasks
210+
env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/timewarp")]
211+
print("\n".join(env_ids))
212+
```
213+
197214
## 💻 Demo
198215

199216
If you want to experiment with a demo agent in BrowserGym, follow these steps

browsergym/core/src/browsergym/core/action/highlevel.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,19 @@
246246
goto, # GOTO, SEARCH
247247
send_msg_to_user, # TERMINATE
248248
],
249+
"timewarp": [ # Starting from an action set identical to assistantbench
250+
scroll, # SCROLL
251+
fill, # TYPE
252+
click, # CLICK
253+
press, # PRESS ENTER
254+
go_back, # GOBACK
255+
goto, # GOTO, SEARCH
256+
send_msg_to_user, # TERMINATE, sends message to user
257+
report_infeasible, # explicit unachievable action, equivalent to STOP "N/A"
258+
new_tab, # Creates a new tab
259+
tab_close, # Closes a tab
260+
tab_focus, # Switches tabs
261+
],
249262
}
250263

251264

@@ -277,6 +290,7 @@ class HighLevelActionSet(AbstractActionSet):
277290
"workarena++",
278291
"weblinx",
279292
"assistantbench",
293+
"timewarp",
280294
"custom",
281295
]
282296
DemoMode = typing.Literal["off", "default", "all_blue", "only_visible_elements"]

browsergym/experiments/pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,17 @@ assistantbench = [
4343
weblinx = [
4444
"weblinx_browsergym",
4545
]
46+
timewarp = [
47+
"browsergym-timewarp"
48+
]
4649
all = [
4750
"browsergym-experiment[miniwob]",
4851
"browsergym-experiment[workarena]",
4952
"browsergym-experiment[webarena]",
5053
"browsergym-experiment[visualwebarena]",
5154
"browsergym-experiment[assistantbench]",
5255
"browsergym-experiment[weblinx]",
56+
"browsergym-experiment[timewarp]",
5357
]
5458

5559
[project.urls]

browsergym/experiments/src/browsergym/experiments/benchmark/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ def make_action_set(self):
6060
"workarena",
6161
"assistantbench",
6262
"weblinx",
63+
"timewarp",
6364
]
6465

6566

browsergym/experiments/src/browsergym/experiments/benchmark/configs.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,13 @@
8787
retry_with_force=True,
8888
demo_mode="off",
8989
),
90+
"timewarp": HighLevelActionSetArgs(
91+
subsets=["timewarp"],
92+
multiaction=False,
93+
strict=False,
94+
retry_with_force=True,
95+
demo_mode="off",
96+
),
9097
}
9198

9299
# all benchmarks are callables designed for lazy loading, i.e. `bench = DEFAULT_BENCHMARKS["miniwob_all"]()`
@@ -291,4 +298,18 @@
291298
),
292299
task_metadata=task_metadata("weblinx"),
293300
),
301+
"timewarp": lambda n_repeats=1: Benchmark(
302+
name="timewarp",
303+
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["timewarp"],
304+
is_multi_tab=True,
305+
supports_parallel_seeds=True,
306+
backends=["timewarp"],
307+
env_args_list=make_env_args_list_from_repeat_tasks(
308+
task_list=task_list_from_metadata(metadata=task_metadata("timewarp")),
309+
max_steps=30,
310+
n_repeats=n_repeats,
311+
seeds_rng=np.random.RandomState(42),
312+
),
313+
task_metadata=task_metadata("timewarp"),
314+
),
294315
}
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
task_name,sites,eval_types,task_id,browsergym_split
2+
timewarp.1,wiki,llm_judge,1,test
3+
timewarp.2,wiki,llm_judge,2,test
4+
timewarp.3,wiki,llm_judge,3,test
5+
timewarp.4,wiki,llm_judge,4,test
6+
timewarp.5,wiki,llm_judge,5,test
7+
timewarp.6,wiki,llm_judge,6,test
8+
timewarp.7,wiki,llm_judge,7,test
9+
timewarp.8,wiki,llm_judge,8,test
10+
timewarp.9,wiki,llm_judge,9,test
11+
timewarp.10,wiki,llm_judge,10,test
12+
timewarp.11,wiki,llm_judge,11,test
13+
timewarp.12,wiki,llm_judge,12,test
14+
timewarp.13,wiki,llm_judge,13,test
15+
timewarp.14,wiki,llm_judge,14,test
16+
timewarp.15,wiki,llm_judge,15,test
17+
timewarp.16,wiki,llm_judge,16,test
18+
timewarp.17,wiki,llm_judge,17,test
19+
timewarp.18,wiki,llm_judge,18,test
20+
timewarp.19,wiki,llm_judge,19,test
21+
timewarp.20,wiki,llm_judge,20,test
22+
timewarp.21,wiki,llm_judge,21,test
23+
timewarp.22,wiki,llm_judge,22,test
24+
timewarp.23,wiki,llm_judge,23,test
25+
timewarp.24,wiki,llm_judge,24,test
26+
timewarp.25,wiki,llm_judge,25,test
27+
timewarp.26,wiki,llm_judge,26,test
28+
timewarp.27,wiki,llm_judge,27,test
29+
timewarp.28,wiki,llm_judge,28,test
30+
timewarp.29,wiki,llm_judge,29,test
31+
timewarp.30,wiki,llm_judge,30,test
32+
timewarp.31,wiki,llm_judge,31,test
33+
timewarp.32,news,llm_judge,32,test
34+
timewarp.33,news,llm_judge,33,test
35+
timewarp.34,news,llm_judge,34,test
36+
timewarp.35,news,llm_judge,35,test
37+
timewarp.36,news,llm_judge,36,test
38+
timewarp.37,news,llm_judge,37,test
39+
timewarp.38,news,llm_judge,38,test
40+
timewarp.39,news,llm_judge,39,test
41+
timewarp.40,news,llm_judge,40,test
42+
timewarp.41,news,llm_judge,41,test
43+
timewarp.42,news,llm_judge,42,test
44+
timewarp.43,news,llm_judge,43,test
45+
timewarp.44,news,llm_judge,44,test
46+
timewarp.45,news,llm_judge,45,test
47+
timewarp.46,news,llm_judge,46,test
48+
timewarp.47,news,llm_judge,47,test
49+
timewarp.48,news,llm_judge,48,test
50+
timewarp.49,news,llm_judge,49,test
51+
timewarp.50,news,llm_judge,50,test
52+
timewarp.51,news,llm_judge,51,test
53+
timewarp.52,news,llm_judge,52,test
54+
timewarp.53,news,llm_judge,53,test
55+
timewarp.54,webshop,llm_judge,54,test
56+
timewarp.55,webshop,llm_judge,55,test
57+
timewarp.56,webshop,llm_judge,56,test
58+
timewarp.57,webshop,llm_judge,57,test
59+
timewarp.58,webshop,llm_judge,58,test
60+
timewarp.59,webshop,llm_judge,59,test
61+
timewarp.60,webshop,llm_judge,60,test
62+
timewarp.61,webshop,llm_judge,61,test
63+
timewarp.62,webshop,llm_judge,62,test
64+
timewarp.63,webshop,llm_judge,63,test
65+
timewarp.64,webshop,llm_judge,64,test
66+
timewarp.65,webshop,llm_judge,65,test
67+
timewarp.66,webshop,llm_judge,66,test
68+
timewarp.67,webshop,llm_judge,67,test
69+
timewarp.68,webshop,llm_judge,68,test
70+
timewarp.69,webshop,llm_judge,69,test
71+
timewarp.70,webshop,llm_judge,70,test
72+
timewarp.71,webshop,llm_judge,71,test
73+
timewarp.72,webshop,llm_judge,72,test
74+
timewarp.73,webshop,llm_judge,73,test
75+
timewarp.74,webshop,llm_judge,74,test
76+
timewarp.75,webshop,llm_judge,75,test
77+
timewarp.76,webshop,llm_judge,76,test
78+
timewarp.77,webshop,llm_judge,77,test
79+
timewarp.78,webshop,llm_judge,78,test
80+
timewarp.79,webshop,llm_judge,79,test
81+
timewarp.80,webshop,llm_judge,80,test
82+
timewarp.81,"wiki,news",llm_judge,81,test
83+
timewarp.82,"wiki,news",llm_judge,82,test
84+
timewarp.83,"wiki,webshop",llm_judge,83,test
85+
timewarp.84,"wiki,news,webshop",llm_judge,84,test
86+
timewarp.85,"news,webshop",llm_judge,85,test
87+
timewarp.86,"wiki,news",llm_judge,86,test
88+
timewarp.87,"wiki,webshop",llm_judge,87,test
89+
timewarp.88,"news,webshop",llm_judge,88,test
90+
timewarp.89,"wiki,webshop",llm_judge,89,test
91+
timewarp.90,"wiki,webshop",llm_judge,90,test
92+
timewarp.91,"wiki,news",llm_judge,91,test
93+
timewarp.92,"wiki,news",llm_judge,92,test
94+
timewarp.93,"wiki,news",llm_judge,93,test
95+
timewarp.94,"wiki,news,webshop",llm_judge,94,test
96+
timewarp.95,"wiki,news,webshop",llm_judge,95,test
97+
timewarp.96,"news,webshop",llm_judge,96,test
98+
timewarp.97,"wiki,news,webshop",llm_judge,97,test
99+
timewarp.98,"wiki,news",llm_judge,98,test
100+
timewarp.99,"wiki,news,webshop",llm_judge,99,test
101+
timewarp.100,"wiki,news,webshop",llm_judge,100,test
102+
timewarp.101,"wiki,webshop",llm_judge,101,test
103+
timewarp.102,"wiki,news",llm_judge,102,test
104+
timewarp.103,"wiki,news",llm_judge,103,test
105+
timewarp.104,wiki,llm_judge,104,train
106+
timewarp.105,wiki,llm_judge,105,train
107+
timewarp.106,wiki,llm_judge,106,train
108+
timewarp.107,wiki,llm_judge,107,train
109+
timewarp.108,wiki,llm_judge,108,train
110+
timewarp.109,wiki,llm_judge,109,train
111+
timewarp.110,wiki,llm_judge,110,train
112+
timewarp.111,wiki,llm_judge,111,train
113+
timewarp.112,wiki,llm_judge,112,train
114+
timewarp.113,wiki,llm_judge,113,train
115+
timewarp.114,wiki,llm_judge,114,train
116+
timewarp.115,wiki,llm_judge,115,train
117+
timewarp.116,wiki,llm_judge,116,train
118+
timewarp.117,wiki,llm_judge,117,train
119+
timewarp.118,wiki,llm_judge,118,train
120+
timewarp.119,wiki,llm_judge,119,train
121+
timewarp.120,wiki,llm_judge,120,train
122+
timewarp.121,wiki,llm_judge,121,train
123+
timewarp.122,wiki,llm_judge,122,train
124+
timewarp.123,wiki,llm_judge,123,train
125+
timewarp.124,wiki,llm_judge,124,train
126+
timewarp.125,wiki,llm_judge,125,train
127+
timewarp.126,wiki,llm_judge,126,train
128+
timewarp.127,wiki,llm_judge,127,train
129+
timewarp.128,wiki,llm_judge,128,train
130+
timewarp.129,wiki,llm_judge,129,train
131+
timewarp.130,wiki,llm_judge,130,train
132+
timewarp.131,wiki,llm_judge,131,train
133+
timewarp.132,wiki,llm_judge,132,train
134+
timewarp.133,wiki,llm_judge,133,train
135+
timewarp.134,wiki,llm_judge,134,train
136+
timewarp.135,wiki,llm_judge,135,train
137+
timewarp.136,wiki,llm_judge,136,train
138+
timewarp.137,wiki,llm_judge,137,train
139+
timewarp.138,wiki,llm_judge,138,train
140+
timewarp.139,wiki,llm_judge,139,train
141+
timewarp.140,wiki,llm_judge,140,train
142+
timewarp.141,wiki,llm_judge,141,train
143+
timewarp.142,wiki,llm_judge,142,train
144+
timewarp.143,news,llm_judge,143,train
145+
timewarp.144,news,llm_judge,144,train
146+
timewarp.145,news,llm_judge,145,train
147+
timewarp.146,news,llm_judge,146,train
148+
timewarp.147,news,llm_judge,147,train
149+
timewarp.148,news,llm_judge,148,train
150+
timewarp.149,news,llm_judge,149,train
151+
timewarp.150,news,llm_judge,150,train
152+
timewarp.151,news,llm_judge,151,train
153+
timewarp.152,news,llm_judge,152,train
154+
timewarp.153,news,llm_judge,153,train
155+
timewarp.154,news,llm_judge,154,train
156+
timewarp.155,news,llm_judge,155,train
157+
timewarp.156,news,llm_judge,156,train
158+
timewarp.157,news,llm_judge,157,train
159+
timewarp.158,news,llm_judge,158,train
160+
timewarp.159,news,llm_judge,159,train
161+
timewarp.160,news,llm_judge,160,train
162+
timewarp.161,news,llm_judge,161,train
163+
timewarp.162,news,llm_judge,162,train
164+
timewarp.163,news,llm_judge,163,train
165+
timewarp.164,news,llm_judge,164,train
166+
timewarp.165,news,llm_judge,165,train
167+
timewarp.166,news,llm_judge,166,train
168+
timewarp.167,news,llm_judge,167,train
169+
timewarp.168,webshop,llm_judge,168,train
170+
timewarp.169,webshop,llm_judge,169,train
171+
timewarp.170,webshop,llm_judge,170,train
172+
timewarp.171,webshop,llm_judge,171,train
173+
timewarp.172,webshop,llm_judge,172,train
174+
timewarp.173,webshop,llm_judge,173,train
175+
timewarp.174,webshop,llm_judge,174,train
176+
timewarp.175,webshop,llm_judge,175,train
177+
timewarp.176,webshop,llm_judge,176,train
178+
timewarp.177,webshop,llm_judge,177,train
179+
timewarp.178,webshop,llm_judge,178,train
180+
timewarp.179,webshop,llm_judge,179,train
181+
timewarp.180,webshop,llm_judge,180,train
182+
timewarp.181,webshop,llm_judge,181,train
183+
timewarp.182,webshop,llm_judge,182,train
184+
timewarp.183,webshop,llm_judge,183,train
185+
timewarp.184,webshop,llm_judge,184,train
186+
timewarp.185,webshop,llm_judge,185,train
187+
timewarp.186,webshop,llm_judge,186,train
188+
timewarp.187,webshop,llm_judge,187,train
189+
timewarp.188,webshop,llm_judge,188,train
190+
timewarp.189,webshop,llm_judge,189,train
191+
timewarp.190,webshop,llm_judge,190,train
192+
timewarp.191,webshop,llm_judge,191,train
193+
timewarp.192,webshop,llm_judge,192,train
194+
timewarp.193,webshop,llm_judge,193,train
195+
timewarp.194,webshop,llm_judge,194,train
196+
timewarp.195,webshop,llm_judge,195,train
197+
timewarp.196,webshop,llm_judge,196,train
198+
timewarp.197,webshop,llm_judge,197,train
199+
timewarp.198,webshop,llm_judge,198,train
200+
timewarp.199,webshop,llm_judge,199,train
201+
timewarp.200,webshop,llm_judge,200,train
202+
timewarp.201,webshop,llm_judge,201,train
203+
timewarp.202,webshop,llm_judge,202,train
204+
timewarp.203,webshop,llm_judge,203,train
205+
timewarp.204,webshop,llm_judge,204,train
206+
timewarp.205,"wiki,news",llm_judge,205,train
207+
timewarp.206,"wiki,news",llm_judge,206,train
208+
timewarp.207,"wiki,webshop,news",llm_judge,207,train
209+
timewarp.208,"news,webshop",llm_judge,208,train
210+
timewarp.209,"wiki,news",llm_judge,209,train
211+
timewarp.210,"wiki,webshop,news",llm_judge,210,train
212+
timewarp.211,"news,webshop",llm_judge,211,train
213+
timewarp.212,"wiki,webshop",llm_judge,212,train
214+
timewarp.213,"wiki,webshop",llm_judge,213,train
215+
timewarp.214,"wiki,news",llm_judge,214,train
216+
timewarp.215,"wiki,news",llm_judge,215,train
217+
timewarp.216,"wiki,news,webshop",llm_judge,216,train
218+
timewarp.217,"wiki,news,webshop",llm_judge,217,train
219+
timewarp.218,"news,webshop",llm_judge,218,train
220+
timewarp.219,"wiki,news,webshop",llm_judge,219,train
221+
timewarp.220,"wiki,news,webshop",llm_judge,220,train
222+
timewarp.221,"wiki,news,webshop",llm_judge,221,train
223+
timewarp.222,"wiki,webshop",llm_judge,222,train
224+
timewarp.223,"wiki,news",llm_judge,223,train
225+
timewarp.224,"wiki,news",llm_judge,224,train
226+
timewarp.225,"wiki,news",llm_judge,225,train
227+
timewarp.226,"news,webshop",llm_judge,226,train
228+
timewarp.227,"news,webshop",llm_judge,227,train
229+
timewarp.228,"wiki,news",llm_judge,228,train
230+
timewarp.229,"news,webshop",llm_judge,229,train
231+
timewarp.230,"wiki,news",llm_judge,230,train
232+
timewarp.231,"wiki,news,webshop",llm_judge,231,train

browsergym/experiments/src/browsergym/experiments/benchmark/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,8 @@ def prepare_backend(backend: str):
260260
all_tasks.extend(weblinx_browsergym.list_tasks(split=split, cache_dir=cache_dir))
261261
demo_ids = weblinx_browsergym.get_unique_demo_ids(tasks=all_tasks)
262262
weblinx_browsergym.download_and_unzip_demos(demo_ids=demo_ids, cache_dir=cache_dir)
263-
263+
case "timewarp":
264+
import browsergym.timewarp
264265
case _:
265266
raise NotImplementedError(f"Unknown benchmark backend {repr(backend)}")
266267

browsergym/experiments/src/browsergym/experiments/loop.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -944,6 +944,8 @@ def _get_env_name(task_name: str):
944944
import browsergym.assistantbench
945945
elif task_name.startswith("weblinx"):
946946
import weblinx_browsergym
947+
elif task_name.startswith("timewarp"):
948+
import browsergym.timewarp
947949

948950
return f"browsergym/{task_name}"
949951

browsergym/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ dependencies = [
4040
"browsergym-workarena>=0.4.1",
4141
"weblinx-browsergym>=0.0.2",
4242
"browsergym-webarenalite==0.14.3",
43-
"browsergym-webarena-verified==0.14.3"
43+
"browsergym-webarena-verified==0.14.3",
4444
]
4545

4646
[tool.setuptools]

dev/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@ tenacity
1515
-e ../browsergym/webarena_verified # local package
1616
browsergym-workarena
1717
weblinx_browsergym
18+
browsergym-timewarp

0 commit comments

Comments
 (0)