Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions packages/browseros-agent/apps/agent/lib/tool-labels.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ const VERB_OVERRIDES: Record<string, string> = {
hover: 'Hovered',
hover_at: 'Hovered at coordinates',
type_at: 'Typed at coordinates',
type_text: 'Typed text',
drag_at: 'Dragged',
focus: 'Focused element',
fill: 'Filled field',
Expand Down Expand Up @@ -186,8 +187,8 @@ const SUBJECT_EXTRACTORS: Record<string, SubjectExtractor> = {
find_files: (i) => quote(stringField(i, 'pattern', 'query')),

// Element interactions
click: (i) => stringField(i, 'element'),
hover: (i) => stringField(i, 'element'),
click: (i) => stringField(i, 'prompt'),
hover: (i) => stringField(i, 'prompt', 'element'),
focus: (i) => stringField(i, 'element'),
clear: (i) => stringField(i, 'element'),
check: (i) => stringField(i, 'element'),
Expand All @@ -199,6 +200,7 @@ const SUBJECT_EXTRACTORS: Record<string, SubjectExtractor> = {
return target ?? truncate(text, 40)
},
press_key: (i) => stringField(i, 'key'),
type_text: (i) => truncate(stringField(i, 'text'), 40),

// Coordinate-based input
click_at: (i) => coords(i.x, i.y),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "openai/gpt-5.5",
"apiKey": "OPENROUTER_API_KEY",
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 20,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"supportsImages": true
},
"dataset": "../../data/agisdk-real-smoke.jsonl",
"num_workers": 1,
"num_workers": 20,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"supportsImages": true
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 4,
"num_workers": 20,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
{
"agent": {
"type": "single",
"provider": "bedrock",
"model": "global.anthropic.claude-opus-4-6-v1",
"region": "AWS_REGION",
"accessKeyId": "AWS_ACCESS_KEY_ID",
"secretAccessKey": "AWS_SECRET_ACCESS_KEY",
"supportsImages": true
"provider": "openrouter",
"model": "anthropic/claude-opus-4.6",
"apiKey": "OPENROUTER_API_KEY",
"supportsImages": true,
"reasoning": {
"enabled": true
},
"verbosity": "high",
"providerRouting": {
"only": ["amazon-bedrock"],
"allowFallbacks": false
}
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 2,
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"workers": 20,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"workers": 20,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"workers": 20,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {
Expand Down
244 changes: 163 additions & 81 deletions packages/browseros-agent/apps/eval/src/agents/single-agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,52 @@ import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { registry } from '@browseros/server/tools/registry'
import { CaptchaWaiter } from '../capture/captcha-waiter'
import { DEFAULT_TIMEOUT_MS } from '../constants'
import type { TaskMetadata } from '../types'
import type { TaskMetadata, UIMessageStreamEvent } from '../types'
import {
isProviderExecutionError,
retryProviderErrors,
} from '../utils/provider-error-retry'
import { resolveProviderConfig } from '../utils/resolve-provider-config'
import { withEvalTimeout } from '../utils/with-eval-timeout'
import type { AgentContext, AgentEvaluator, AgentResult } from './types'

// Maximum number of continuation prompts issued when the model stops with
// empty output immediately after a tool result. Once this many continuations
// have been used, one more such empty stop raises an error instead.
const EMPTY_TOOL_RESULT_STOP_CONTINUATION_LIMIT = 2

/**
 * Structural subset of a tool-loop generation result: only the fields that
 * the empty-stop check below reads.
 */
interface ToolLoopResultShape {
  /** Result text; a whitespace-only value is treated as empty. */
  text: string
  /** Reason the generation finished; compared against `'stop'`. */
  finishReason: string
  /** Tool calls carried by the result; only the count is inspected. */
  toolCalls: readonly unknown[]
  /** Steps of the run, in order; only each step's tool results are inspected. */
  steps: readonly { toolResults: readonly unknown[] }[]
}

/**
 * Reports whether a generation ended with an "empty stop" right after a tool
 * result, i.e. all of the following hold:
 *
 * - the finish reason is `'stop'`,
 * - the result text is empty once trimmed,
 * - the result carries no tool calls,
 * - the second-to-last step produced at least one tool result.
 *
 * @param result - The generation result to inspect.
 * @returns `true` when every condition above holds, otherwise `false`.
 */
export function shouldContinueAfterEmptyToolResultStop(
  result: ToolLoopResultShape,
): boolean {
  // Second-to-last step; undefined when fewer than two steps were recorded.
  const stepBeforeLast: ToolLoopResultShape['steps'][number] | undefined =
    result.steps[result.steps.length - 2]

  if (result.finishReason !== 'stop') {
    return false
  }
  if (result.text.trim().length !== 0) {
    return false
  }
  if (result.toolCalls.length !== 0) {
    return false
  }

  const priorToolResultCount = stepBeforeLast?.toolResults.length ?? 0
  return priorToolResultCount > 0
}

/**
 * Builds the follow-up prompt used to resume a task after the model stopped
 * without output right after a tool result.
 *
 * The prompt consists of a resume instruction, a blank line, guidance on not
 * stopping after routine tool results, a blank line, and finally the original
 * task under an "Original task:" heading.
 *
 * @param taskQuery - The original task text, appended verbatim as the last line(s).
 * @returns The newline-separated continuation prompt.
 */
export function buildEmptyToolResultStopContinuationPrompt(
  taskQuery: string,
): string {
  const resumeInstruction =
    'Continue the eval task from the current browser state.'
  const guidance =
    'The previous model response stopped immediately after a tool result without issuing another tool call or a final answer. Do not stop after routine tool results. If the requested workflow is complete, respond with a brief completion message. Otherwise, inspect the page if needed and continue using tools.'
  const taskHeading = 'Original task:'

  return `${resumeInstruction}\n\n${guidance}\n\n${taskHeading}\n${taskQuery}`
}

export class SingleAgentEvaluator implements AgentEvaluator {
constructor(private ctx: AgentContext) {}

Expand Down Expand Up @@ -89,87 +130,128 @@ export class SingleAgentEvaluator implements AgentEvaluator {
capture,
async (signal) => {
if (!agent) throw new Error('Agent was not initialized')
// Format prompt with browser context so the agent knows what page it's on
// (same formatting as chat-service.ts → formatUserMessage)
const prompt = formatUserMessage(task.query, browserContext)
const result = await agent.toolLoopAgent.generate({
prompt,
abortSignal: signal,

experimental_onToolCallStart: ({ toolCall }) => {
const input = toolCall.input as
| Record<string, unknown>
| undefined
if (input && typeof input.page === 'number') {
capture.setActivePageId(input.page)
}
},

experimental_onToolCallFinish: async () => {
try {
if (captchaWaiter) {
await captchaWaiter.waitIfCaptchaPresent(
browser,
capture.getActivePageId(),
)
}
const screenshotNum = await capture.screenshot.capture(
capture.getActivePageId(),
)
capture.emitEvent(task.query_id, {
type: 'screenshot-captured',
screenshot: screenshotNum,
})
} catch {
// Screenshot failures are non-fatal
}
},

onStepFinish: async ({ toolCalls, toolResults, text }) => {
if (toolCalls) {
for (const tc of toolCalls) {
const inputEvent = {
type: 'tool-input-available',
toolCallId: tc.toolCallId,
toolName: tc.toolName,
input: tc.input,
} as any
await capture.messageLogger.logStreamEvent(inputEvent)
capture.emitEvent(task.query_id, inputEvent)
}
}

if (toolResults) {
for (const tr of toolResults) {
const outputEvent = {
type: 'tool-output-available',
toolCallId: tr.toolCallId,
output: tr.output,
} as any
await capture.messageLogger.logStreamEvent(outputEvent)
capture.emitEvent(task.query_id, outputEvent)
}
}

if (text) {
const textId = randomUUID()
const startEvent = { type: 'text-start', id: textId } as any
const deltaEvent = {
type: 'text-delta',
id: textId,
delta: text,
} as any
const endEvent = { type: 'text-end', id: textId } as any
await capture.messageLogger.logStreamEvent(startEvent)
await capture.messageLogger.logStreamEvent(deltaEvent)
await capture.messageLogger.logStreamEvent(endEvent)
capture.emitEvent(task.query_id, deltaEvent)
}
},
})

finalText = result.text || null
const activeAgent = agent

let continuationCount = 0
let currentQuery = task.query

for (;;) {
// Format prompt with browser context so the agent knows what page it's on
// (same formatting as chat-service.ts → formatUserMessage)
const prompt = formatUserMessage(currentQuery, browserContext)
const result = await retryProviderErrors(
() =>
activeAgent.toolLoopAgent.generate({
prompt,
abortSignal: signal,

experimental_onToolCallStart: ({ toolCall }) => {
const input = toolCall.input as
| Record<string, unknown>
| undefined
if (input && typeof input.page === 'number') {
capture.setActivePageId(input.page)
}
},

experimental_onToolCallFinish: async () => {
try {
if (captchaWaiter) {
await captchaWaiter.waitIfCaptchaPresent(
browser,
capture.getActivePageId(),
)
}
const screenshotNum = await capture.screenshot.capture(
capture.getActivePageId(),
)
capture.emitEvent(task.query_id, {
type: 'screenshot-captured',
screenshot: screenshotNum,
})
} catch {
// Screenshot failures are non-fatal
}
},

onStepFinish: async ({ toolCalls, toolResults, text }) => {
if (toolCalls) {
for (const tc of toolCalls) {
const inputEvent: UIMessageStreamEvent = {
type: 'tool-input-available',
toolCallId: tc.toolCallId,
toolName: tc.toolName,
input: tc.input,
}
await capture.messageLogger.logStreamEvent(inputEvent)
capture.emitEvent(task.query_id, inputEvent)
}
}

if (toolResults) {
for (const tr of toolResults) {
const outputEvent: UIMessageStreamEvent = {
type: 'tool-output-available',
toolCallId: tr.toolCallId,
output: tr.output,
}
await capture.messageLogger.logStreamEvent(outputEvent)
capture.emitEvent(task.query_id, outputEvent)
}
}

if (text) {
const textId = randomUUID()
const startEvent: UIMessageStreamEvent = {
type: 'text-start',
id: textId,
}
const deltaEvent: UIMessageStreamEvent = {
type: 'text-delta',
id: textId,
delta: text,
}
const endEvent: UIMessageStreamEvent = {
type: 'text-end',
id: textId,
}
await capture.messageLogger.logStreamEvent(startEvent)
await capture.messageLogger.logStreamEvent(deltaEvent)
await capture.messageLogger.logStreamEvent(endEvent)
capture.emitEvent(task.query_id, deltaEvent)
}
},
}),
{
label: `single-agent ${task.query_id}`,
signal,
},
)

if (!shouldContinueAfterEmptyToolResultStop(result)) {
finalText = result.text || null
break
}

if (
continuationCount >= EMPTY_TOOL_RESULT_STOP_CONTINUATION_LIMIT
) {
throw new Error(
`Model stopped with empty output immediately after a tool result ${continuationCount + 1} times`,
)
}

continuationCount++
capture.addWarning(
'agent_execution',
`Model stopped with empty output immediately after a tool result; continuing task (${continuationCount}/${EMPTY_TOOL_RESULT_STOP_CONTINUATION_LIMIT})`,
)
currentQuery = buildEmptyToolResultStopContinuationPrompt(
task.query,
)
}
},
{ rethrowError: isProviderExecutionError },
)

const endTime = Date.now()
Expand Down
Loading
Loading