Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions browsertrace/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

from __future__ import annotations

import json
import sqlite3
from typing import Any


def run_summary(run: sqlite3.Row) -> dict[str, str]:
Expand All @@ -25,16 +27,126 @@ def step_for_compare(step: sqlite3.Row | None) -> dict[str, object] | None:
}


def _parse_json_field(value: Any) -> Any:
if value in (None, ""):
return None
if isinstance(value, (dict, list)):
return value
if isinstance(value, (bytes, bytearray)):
value = value.decode("utf-8", errors="ignore")
if isinstance(value, str):
try:
return json.loads(value)
except (TypeError, ValueError):
return None
return None


def _deep_get(data: dict[str, Any], paths: list[tuple[str, ...]]) -> str:
for path in paths:
cursor: Any = data
ok = True
for key in path:
if isinstance(cursor, dict) and key in cursor:
cursor = cursor[key]
else:
ok = False
break
if ok and cursor not in (None, ""):
return str(cursor)
return ""


def compare_metadata(steps: list[sqlite3.Row]) -> dict[str, str]:
    """Collect the metadata that tells whether two runs are comparable.

    Scans each step's ``metadata`` and ``model_input`` payloads, in step
    order, for a small fixed set of keys (tool versions, model identity and
    prompt template version). For every step the lookup order is: the parsed
    ``metadata`` dict, its nested ``"metadata"`` dict, then the parsed
    ``model_input`` dict. The first non-empty value found for a field is kept
    and never overwritten by later steps.

    This is best-effort and deliberately narrow: fields that cannot be found
    are reported as empty strings.

    Returns:
        A dict with the keys ``browser_use_version``, ``browsertrace_version``,
        ``model_provider``, ``model`` and ``prompt_template_version``.
    """
    # Candidate key paths per output field, in priority order.
    lookup_paths: dict[str, list[tuple[str, ...]]] = {
        "browser_use_version": [
            ("browser_use_version",),
            ("browseruse_version",),
        ],
        "browsertrace_version": [
            ("browsertrace_version",),
        ],
        "model_provider": [
            ("model_provider",),
            ("provider",),
            ("llm_provider",),
            ("model", "provider"),
        ],
        "model": [
            ("model",),
            ("model_name",),
            ("llm_model",),
            ("model", "name"),
        ],
        "prompt_template_version": [
            ("prompt_template_version",),
            ("prompt_version",),
            ("template_version",),
            ("prompt", "template_version"),
        ],
    }
    resolved: dict[str, str] = {name: "" for name in lookup_paths}

    for row in steps:
        parsed_metadata = _parse_json_field(row["metadata"])
        parsed_input = _parse_json_field(row["model_input"])

        top_level = parsed_metadata if isinstance(parsed_metadata, dict) else {}
        inner = top_level.get("metadata")
        if not isinstance(inner, dict):
            inner = {}
        input_payload = parsed_input if isinstance(parsed_input, dict) else {}
        sources = (top_level, inner, input_payload)

        for name, paths in lookup_paths.items():
            if resolved[name]:
                # Already filled by an earlier step or source.
                continue
            hits = (_deep_get(source, paths) for source in sources)
            resolved[name] = next((hit for hit in hits if hit), "")

        # Stop scanning as soon as every field has a value.
        if all(resolved.values()):
            break

    return resolved


def compare_runs(
left_run: sqlite3.Row,
left_steps: list[sqlite3.Row],
right_run: sqlite3.Row,
right_steps: list[sqlite3.Row],
) -> dict[str, object]:
left_metadata = compare_metadata(left_steps)
right_metadata = compare_metadata(right_steps)

payload: dict[str, object] = {
"left": run_summary(left_run),
"right": run_summary(right_run),
"step_counts": {"left": len(left_steps), "right": len(right_steps)},
"compare_metadata": {
"left": left_metadata,
"right": right_metadata,
"differences": {
key: {"left": left_metadata[key], "right": right_metadata[key]}
for key in left_metadata.keys()
if left_metadata[key] != right_metadata[key]
},
},
"first_divergence": None,
}

Expand Down
85 changes: 85 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,51 @@ def _seed_compare_runs(tmp_path):
return failed_id, success_id


def _seed_compare_runs_with_metadata(tmp_path):
    """Seed two runs whose first steps carry comparison metadata.

    The first run stores its values under the step's ``metadata`` argument;
    the second stores different values under ``model_input`` and then raises
    ``RuntimeError`` inside the run context (presumably so the tracer records
    it as failed -- confirm against ``Tracer``).

    Returns:
        ``(failed_id, success_id)`` -- the ids of the second and first run.
    """
    tracer = Tracer(home=tmp_path)
    start_url = "https://example.com/start"

    success_metadata = {
        "browser_use_version": "0.1.45",
        "browsertrace_version": "0.1.19",
        "model_provider": "openai",
        "model": "gpt-4o-mini",
        "prompt_template_version": "prompt-v3",
    }
    failure_model_input = {
        "browser_use_version": "0.1.46",
        "browsertrace_version": "0.1.19",
        "model_provider": "openai",
        "model": "gpt-4.1-mini",
        "prompt_template_version": "prompt-v4",
    }

    with tracer.run("browser-use success metadata") as ok_run:
        ok_run.step(action="navigate", url=start_url, metadata=success_metadata)
        ok_run.step(
            action="click(selector=#checkout)",
            url="https://example.com/done",
        )
        success_id = ok_run.id

    try:
        with tracer.run("browser-use failure metadata") as bad_run:
            bad_run.step(
                action="navigate",
                url=start_url,
                model_input=failure_model_input,
            )
            bad_run.step(
                action="click(selector=#cancel)",
                url="https://example.com/cart",
                status="error",
                error="wrong target",
            )
            # Capture the id before the exception leaves the run context.
            failed_id = bad_run.id
            raise RuntimeError("wrong target")
    except RuntimeError:
        # The exception exists only to end the run abnormally.
        pass

    return failed_id, success_id


def test_cli_module_compiles_on_python311():
"""Guard against Python 3.11 f-string syntax regressions.

Expand Down Expand Up @@ -202,6 +247,14 @@ def test_cli_compare_json_reports_first_divergent_step(cli):
assert payload["left"]["status"] == "failed"
assert payload["right"]["id"] == success_id
assert payload["right"]["status"] == "completed"
assert payload["compare_metadata"]["left"] == {
"browser_use_version": "",
"browsertrace_version": "",
"model_provider": "",
"model": "",
"prompt_template_version": "",
}
assert payload["compare_metadata"]["differences"] == {}
assert payload["first_divergence"]["step_index"] == 1
assert payload["first_divergence"]["left_step"]["action"] == "click(selector=#cancel)"
assert payload["first_divergence"]["right_step"]["action"] == "click(selector=#checkout)"
Expand All @@ -219,6 +272,38 @@ def test_cli_compare_json_reports_first_divergent_step(cli):
}


def test_cli_compare_json_includes_compare_metadata_and_differences(cli):
    """``compare --json`` reports per-run metadata and only the differing keys."""
    cli_mod, tmp_path = cli
    failed_id, success_id = _seed_compare_runs_with_metadata(tmp_path)
    argv = ["compare", failed_id[:8], success_id[:8], "--json"]

    captured = StringIO()
    with redirect_stdout(captured):
        exit_code = cli_mod.main(argv)

    payload = json.loads(captured.getvalue())

    assert exit_code == 0

    # Left is the failed run (values seeded via model_input).
    expected_left = {
        "browser_use_version": "0.1.46",
        "browsertrace_version": "0.1.19",
        "model_provider": "openai",
        "model": "gpt-4.1-mini",
        "prompt_template_version": "prompt-v4",
    }
    # Right is the successful run (values seeded via metadata).
    expected_right = {
        "browser_use_version": "0.1.45",
        "browsertrace_version": "0.1.19",
        "model_provider": "openai",
        "model": "gpt-4o-mini",
        "prompt_template_version": "prompt-v3",
    }
    # Keys with identical values on both sides must not be listed.
    expected_differences = {
        "browser_use_version": {"left": "0.1.46", "right": "0.1.45"},
        "model": {"left": "gpt-4.1-mini", "right": "gpt-4o-mini"},
        "prompt_template_version": {"left": "prompt-v4", "right": "prompt-v3"},
    }

    assert payload["compare_metadata"]["left"] == expected_left
    assert payload["compare_metadata"]["right"] == expected_right
    assert payload["compare_metadata"]["differences"] == expected_differences


def test_cli_compare_human_output_mentions_first_divergence(cli):
cli_mod, tmp_path = cli
failed_id, success_id = _seed_compare_runs(tmp_path)
Expand Down