From ddaf0ddf27a60a2ddac41f22413227e3745d7422 Mon Sep 17 00:00:00 2001 From: topaz Date: Fri, 8 May 2026 18:26:21 -0700 Subject: [PATCH] feat: add wait and custom code browser tools (bosmain-6fd) --- .../apps/server/src/agent/prompt.ts | 5 +- .../apps/server/src/browser/browser.ts | 66 +++++++- .../apps/server/src/tools/navigation.ts | 78 ++++++++-- .../apps/server/src/tools/registry.ts | 9 +- .../apps/server/src/tools/snapshot.ts | 49 ++++++ .../server/src/tools/tool-label-registry.ts | 11 +- .../server/tests/tools/navigation.test.ts | 47 ++++++ .../server/tests/tools/observation.test.ts | 55 +++++++ .../apps/server/tests/tools/registry.test.ts | 142 ++++++++++++++++++ 9 files changed, 435 insertions(+), 27 deletions(-) create mode 100644 packages/browseros-agent/apps/server/tests/tools/registry.test.ts diff --git a/packages/browseros-agent/apps/server/src/agent/prompt.ts b/packages/browseros-agent/apps/server/src/agent/prompt.ts index 52be6842c..39db23f21 100644 --- a/packages/browseros-agent/apps/server/src/agent/prompt.ts +++ b/packages/browseros-agent/apps/server/src/agent/prompt.ts @@ -138,6 +138,7 @@ You control a Chromium browser. Key tool categories: - \`get_dom\` / \`search_dom\` → raw HTML (use for precise CSS/XPath queries) - \`take_screenshot\` → visual capture (use for verification or saving) - \`evaluate_script\` → run JS on the page (use for dynamic data extraction) +- \`browser_run_code\` → run custom async page code when predefined tools are insufficient - \`get_console_logs\` → browser console output (use for debugging) **Interaction** — act on page elements: @@ -152,6 +153,7 @@ You control a Chromium browser. Key tool categories: **Navigation**: - \`navigate_page\` → go to URL, back, forward, reload - \`new_page\` → open new tab (only when user explicitly asks) +- \`wait_for\` → wait for text/selectors to appear or disappear, or pause briefly - \`close_page\` → close a tab **Bookmarks**: \`get_bookmarks\`, \`create_bookmark\`, \`remove_bookmark\`, \`update_bookmark\`, \`move_bookmark\`, \`search_bookmarks\` @@ -315,6 +317,7 @@ function getToolSelection( | Looking for specific links | \`get_page_links\` | | Need exact HTML or CSS selectors | \`get_dom\` or \`search_dom\` | | Need runtime data (JS variables, computed values) | \`evaluate_script\` | +| Need custom async page logic | \`browser_run_code\` | | Something isn't working, need to debug | \`get_console_logs\` | | Need visual proof or to save an image | \`take_screenshot\` or \`save_screenshot\` | @@ -417,7 +420,7 @@ function getErrorRecovery( ## Error Recovery ### Browser interaction errors -- Element not found → \`scroll(page, "down")\`, \`wait_for(page, text)\`, then \`take_snapshot(page)\` to re-fetch elements +- Element not found → \`scroll(page, "down")\`, \`wait_for(page, { text })\`, then \`take_snapshot(page)\` to re-fetch elements - Click/fill failed → \`scroll(page, "down", element)\` into view, retry once - Page didn't load → check URL, try \`navigate_page\` with reload - After 2 failed attempts → describe the blocking issue, request guidance diff --git a/packages/browseros-agent/apps/server/src/browser/browser.ts b/packages/browseros-agent/apps/server/src/browser/browser.ts index e4a45cd83..86cd34476 100644 --- a/packages/browseros-agent/apps/server/src/browser/browser.ts +++ b/packages/browseros-agent/apps/server/src/browser/browser.ts @@ -657,13 +657,19 @@ export class Browser { async waitFor( page: number, - opts: { text?: string; selector?: string; timeout: number }, + opts: { + text?: string + textGone?: string + selector?: string + selectorGone?: string + timeout: number + }, ): Promise { const session = await this.resolveSession(page) const deadline = Date.now() + opts.timeout const interval = 500 - while (Date.now() < deadline) { + while (Date.now() <= deadline) { if (opts.text) { const result = await session.Runtime.evaluate({ expression: `document.body?.innerText?.includes(${JSON.stringify(opts.text)}) ?? false`, @@ -672,6 +678,14 @@ export class Browser { if (result.result?.value === true) return true } + if (opts.textGone) { + const result = await session.Runtime.evaluate({ + expression: `!(document.body?.innerText?.includes(${JSON.stringify(opts.textGone)}) ?? false)`, + returnByValue: true, + }) + if (result.result?.value === true) return true + } + if (opts.selector) { const result = await session.Runtime.evaluate({ expression: `!!document.querySelector(${JSON.stringify(opts.selector)})`, @@ -680,7 +694,17 @@ export class Browser { if (result.result?.value === true) return true } - await new Promise((r) => setTimeout(r, interval)) + if (opts.selectorGone) { + const result = await session.Runtime.evaluate({ + expression: `!document.querySelector(${JSON.stringify(opts.selectorGone)})`, + returnByValue: true, + }) + if (result.result?.value === true) return true + } + + const remaining = deadline - Date.now() + if (remaining <= 0) break + await new Promise((r) => setTimeout(r, Math.min(interval, remaining))) } return false @@ -941,6 +965,42 @@ export class Browser { } } + async runCode( + page: number, + code: string, + args?: Record, + ): Promise<{ + value?: unknown + error?: string + description?: string + }> { + const session = await this.resolveSession(page) + const expression = `( + async (args) => { + ${code} + } + )(${JSON.stringify(args ?? {})})` + + const result = await session.Runtime.evaluate({ + expression, + returnByValue: true, + awaitPromise: true, + }) + + if (result.exceptionDetails) { + return { + error: + result.exceptionDetails.exception?.description ?? + result.exceptionDetails.text, + } + } + + return { + value: result.result?.value, + description: result.result?.description, + } + } + async getDom(page: number, opts?: { selector?: string }): Promise { const session = await this.resolveSession(page) const doc = await session.DOM.getDocument({ depth: 0 }) diff --git a/packages/browseros-agent/apps/server/src/tools/navigation.ts b/packages/browseros-agent/apps/server/src/tools/navigation.ts index 37747dcc6..2e8a4ced1 100644 --- a/packages/browseros-agent/apps/server/src/tools/navigation.ts +++ b/packages/browseros-agent/apps/server/src/tools/navigation.ts @@ -299,13 +299,29 @@ export const close_page = defineNavigationTool({ export const wait_for = defineNavigationTool({ name: 'wait_for', description: - 'Wait for text or a CSS selector to appear on the page. Polls periodically up to a timeout.', + 'Wait for text or a CSS selector to appear or disappear on the page, or wait for a fixed time. Polls periodically up to a timeout.', input: z.object({ page: pageParam, text: z.string().optional().describe('Text to wait for on the page'), + textGone: z + .string() + .optional() + .describe('Text to wait for to disappear from the page'), selector: z.string().optional().describe('CSS selector to wait for'), + selectorGone: z + .string() + .optional() + .describe('CSS selector to wait for to disappear from the page'), + time: z + .number() + .min(0) + .max(120000) + .optional() + .describe('Fixed wait time in milliseconds'), timeout: z .number() + .min(0) + .max(120000) .default(10000) .describe('Maximum wait time in milliseconds'), }), @@ -316,40 +332,76 @@ export const wait_for = defineNavigationTool({ timeout: z.number(), }), handler: async (args, ctx, response) => { - if (!args.text && !args.selector) { - response.error('Provide either text or selector to wait for.') + const timeout = args.timeout ?? 10_000 + const target = + args.text !== undefined + ? `text "${args.text}"` + : args.textGone !== undefined + ? `text "${args.textGone}" to disappear` + : args.selector !== undefined + ? `selector "${args.selector}"` + : args.selectorGone !== undefined + ? `selector "${args.selectorGone}" to disappear` + : args.time !== undefined + ? `${args.time}ms` + : '' + + if ( + !args.text && + !args.textGone && + !args.selector && + !args.selectorGone && + args.time === undefined + ) { + response.error( + 'Provide text, textGone, selector, selectorGone, or time to wait for.', + ) + return + } + + if ( + args.time !== undefined && + !args.text && + !args.textGone && + !args.selector && + !args.selectorGone + ) { + await new Promise((resolve) => setTimeout(resolve, args.time)) + response.text(`Waited ${args.time}ms.`) + response.data({ + page: args.page, + found: true, + target, + timeout: args.time, + }) return } const found = await ctx.browser.waitFor(args.page, { text: args.text, + textGone: args.textGone, selector: args.selector, - timeout: args.timeout, + selectorGone: args.selectorGone, + timeout, }) if (found) { - const target = args.text - ? `text "${args.text}"` - : `selector "${args.selector}"` response.text(`Found ${target} on page.`) response.data({ page: args.page, found, target, - timeout: args.timeout, + timeout, }) response.includeSnapshot(args.page) } else { - const target = args.text - ? `text "${args.text}"` - : `selector "${args.selector}"` response.data({ page: args.page, found, target, - timeout: args.timeout, + timeout, }) - response.error(`Timed out after ${args.timeout}ms waiting for ${target}.`) + response.error(`Timed out after ${timeout}ms waiting for ${target}.`) } }, }) diff --git a/packages/browseros-agent/apps/server/src/tools/registry.ts b/packages/browseros-agent/apps/server/src/tools/registry.ts index ae119d70b..0f03e0cbc 100644 --- a/packages/browseros-agent/apps/server/src/tools/registry.ts +++ b/packages/browseros-agent/apps/server/src/tools/registry.ts @@ -43,12 +43,12 @@ import { new_hidden_page, new_page, show_page, - // biome-ignore lint/correctness/noUnusedImports: temporarily disabled wait_for, } from './navigation' import { suggest_app_connection, suggest_schedule } from './nudges' import { download_file, save_pdf, save_screenshot } from './page-actions' import { + browser_run_code, evaluate_script, get_page_content, get_page_links, @@ -73,7 +73,7 @@ import { } from './windows' export const registry = createRegistry([ - // Navigation (8) + // Navigation (9) get_active_page, list_pages, navigate_page, @@ -82,9 +82,9 @@ export const registry = createRegistry([ show_page, move_page, close_page, - // wait_for, // temporarily disabled + wait_for, - // Observation (9) + // Observation (10) take_snapshot, take_enhanced_snapshot, get_page_content, @@ -93,6 +93,7 @@ export const registry = createRegistry([ search_dom, take_screenshot, evaluate_script, + browser_run_code, get_console_logs, // Input (17) diff --git a/packages/browseros-agent/apps/server/src/tools/snapshot.ts b/packages/browseros-agent/apps/server/src/tools/snapshot.ts index 30b469d0a..b56c7f304 100644 --- a/packages/browseros-agent/apps/server/src/tools/snapshot.ts +++ b/packages/browseros-agent/apps/server/src/tools/snapshot.ts @@ -246,3 +246,52 @@ export const evaluate_script = defineScriptTool({ }) }, }) + +export const browser_run_code = defineScriptTool({ + name: 'browser_run_code', + description: + 'Execute async custom JavaScript code in the page context. The code runs as an async function body with a serializable args object available and may use return to produce a result.', + input: z.object({ + page: pageParam, + code: z + .string() + .describe( + 'JavaScript function body to run in the page context. Use return to provide output.', + ), + args: z + .record(z.unknown()) + .optional() + .describe('Serializable arguments available to the code as args'), + }), + output: z.object({ + text: z.string(), + value: z.unknown().optional(), + description: z.string().optional(), + }), + handler: async (args, ctx, response) => { + const result = await ctx.browser.runCode(args.page, args.code, args.args) + + if (result.error) { + response.error(`Code error: ${result.error}`) + return + } + + const val = result.value + let text: string + if (val === undefined) { + text = result.description ?? 'undefined' + response.text(text) + } else if (typeof val === 'string') { + text = val + response.text(text) + } else { + text = JSON.stringify(val, null, 2) + response.text(text) + } + response.data({ + text, + value: result.value, + description: result.description, + }) + }, +}) diff --git a/packages/browseros-agent/apps/server/src/tools/tool-label-registry.ts b/packages/browseros-agent/apps/server/src/tools/tool-label-registry.ts index 5657605e4..62c16a5c4 100644 --- a/packages/browseros-agent/apps/server/src/tools/tool-label-registry.ts +++ b/packages/browseros-agent/apps/server/src/tools/tool-label-registry.ts @@ -20,6 +20,7 @@ const VERB_OVERRIDES: Record = { get_active_page: 'Got active tab', move_page: 'Moved tab', group_tabs: 'Grouped tabs', + wait_for: 'Waited for page state', // Page reading take_snapshot: 'Captured page snapshot', @@ -47,6 +48,7 @@ const VERB_OVERRIDES: Record = { // Console / scripts evaluate_script: 'Ran script', + browser_run_code: 'Ran custom page code', get_console_logs: 'Read console logs', // History / bookmarks @@ -292,12 +294,9 @@ function canonicalName(rawName: string): string { function humanizeToolName(rawName: string): string { const stripped = canonicalName(rawName) const words = stripped.split(/[_-]/).filter((w) => w.length > 0) - if (words.length === 0) return rawName - const first = words[0]! - return [ - first.charAt(0).toUpperCase() + first.slice(1), - ...words.slice(1), - ].join(' ') + const [first, ...rest] = words + if (!first) return rawName + return [first.charAt(0).toUpperCase() + first.slice(1), ...rest].join(' ') } /** diff --git a/packages/browseros-agent/apps/server/tests/tools/navigation.test.ts b/packages/browseros-agent/apps/server/tests/tools/navigation.test.ts index 50d979a5b..c69552cb8 100644 --- a/packages/browseros-agent/apps/server/tests/tools/navigation.test.ts +++ b/packages/browseros-agent/apps/server/tests/tools/navigation.test.ts @@ -11,6 +11,7 @@ import { show_page, wait_for, } from '../../src/tools/navigation' +import { evaluate_script } from '../../src/tools/snapshot' import { close_window, create_window } from '../../src/tools/windows' import { withBrowser } from '../__helpers__/with-browser' @@ -120,6 +121,52 @@ describe('navigation tools', () => { }) }, 60_000) + it('wait_for waits for text to disappear', async () => { + await withBrowser(async ({ execute }) => { + const newResult = await execute(new_page, { url: 'about:blank' }) + const pageId = structuredOf<{ pageId: number }>(newResult).pageId + + await execute(evaluate_script, { + page: pageId, + expression: ` + document.body.textContent = 'Loading complete' + setTimeout(() => { + document.body.textContent = 'Ready' + }, 100) + `, + }) + + const waitResult = await execute(wait_for, { + page: pageId, + textGone: 'Loading complete', + timeout: 5_000, + }) + assert.ok(!waitResult.isError, textOf(waitResult)) + const data = structuredOf<{ found: boolean; page: number }>(waitResult) + assert.strictEqual(data.found, true) + assert.strictEqual(data.page, pageId) + + await execute(close_page, { page: pageId }) + }) + }, 60_000) + + it('wait_for can wait for a fixed time', async () => { + await withBrowser(async ({ execute }) => { + const newResult = await execute(new_page, { url: 'about:blank' }) + const pageId = structuredOf<{ pageId: number }>(newResult).pageId + const start = Date.now() + + const waitResult = await execute(wait_for, { + page: pageId, + time: 50, + }) + assert.ok(!waitResult.isError, textOf(waitResult)) + assert.ok(Date.now() - start >= 40, 'Expected wait_for to delay') + + await execute(close_page, { page: pageId }) + }) + }, 60_000) + it('wait_for times out for missing text', async () => { await withBrowser(async ({ execute }) => { const newResult = await execute(new_page, { url: 'about:blank' }) diff --git a/packages/browseros-agent/apps/server/tests/tools/observation.test.ts b/packages/browseros-agent/apps/server/tests/tools/observation.test.ts index 982296c64..2d59e9ca7 100644 --- a/packages/browseros-agent/apps/server/tests/tools/observation.test.ts +++ b/packages/browseros-agent/apps/server/tests/tools/observation.test.ts @@ -5,6 +5,7 @@ import { tmpdir } from 'node:os' import { dirname, join } from 'node:path' import { close_page, navigate_page, new_page } from '../../src/tools/navigation' import { + browser_run_code, evaluate_script, get_page_content, get_page_links, @@ -155,6 +156,60 @@ describe('observation tools', () => { }) }, 60_000) + it('browser_run_code runs async custom code in the page', async () => { + await withBrowser(async ({ execute }) => { + const newResult = await execute(new_page, { url: 'about:blank' }) + const pageId = pageIdOf(newResult) + + const runResult = await execute(browser_run_code, { + page: pageId, + args: { + id: 'browser-run-code-target', + text: 'custom code result', + }, + code: ` + const div = document.createElement('div') + div.id = args.id + div.textContent = await Promise.resolve(args.text) + document.body.appendChild(div) + return { + text: document.getElementById(args.id)?.textContent, + divCount: document.querySelectorAll('div').length, + } + `, + }) + + assert.ok(!runResult.isError, textOf(runResult)) + const data = structuredOf<{ + value: { text: string; divCount: number } + text: string + }>(runResult) + assert.deepStrictEqual(data.value, { + text: 'custom code result', + divCount: 1, + }) + assert.ok(data.text.includes('custom code result')) + + await execute(close_page, { page: pageId }) + }) + }, 60_000) + + it('browser_run_code reports thrown errors', async () => { + await withBrowser(async ({ execute }) => { + const newResult = await execute(new_page, { url: 'about:blank' }) + const pageId = pageIdOf(newResult) + + const runResult = await execute(browser_run_code, { + page: pageId, + code: 'throw new Error("custom failure")', + }) + assert.ok(runResult.isError, 'Expected browser_run_code to fail') + assert.ok(textOf(runResult).includes('custom failure')) + + await execute(close_page, { page: pageId }) + }) + }, 60_000) + it('get_page_content returns markdown text', async () => { await withBrowser(async ({ execute }) => { const newResult = await execute(new_page, { url: 'https://example.com' }) diff --git a/packages/browseros-agent/apps/server/tests/tools/registry.test.ts b/packages/browseros-agent/apps/server/tests/tools/registry.test.ts new file mode 100644 index 000000000..7ecb7daba --- /dev/null +++ b/packages/browseros-agent/apps/server/tests/tools/registry.test.ts @@ -0,0 +1,142 @@ +import { describe, it } from 'bun:test' +import assert from 'node:assert' +import type { Browser } from '../../src/browser/browser' +import { executeTool, type ToolContext } from '../../src/tools/framework' +import { wait_for } from '../../src/tools/navigation' +import { registry } from '../../src/tools/registry' +import { browser_run_code } from '../../src/tools/snapshot' + +function textOf(result: { + content: { type: string; text?: string }[] +}): string { + return result.content + .filter((c) => c.type === 'text') + .map((c) => c.text) + .join('\n') +} + +function structuredOf(result: { structuredContent?: unknown }): T { + assert.ok(result.structuredContent, 'Expected structuredContent') + return result.structuredContent as T +} + +function createToolContext(methods: Record): ToolContext { + return { + browser: { + getTabIdForPage: () => undefined, + snapshot: async () => '', + ...methods, + } as unknown as Browser, + directories: { workingDir: process.cwd() }, + } +} + +describe('tool registry', () => { + it('exposes wait and custom code tools to MCP clients', () => { + assert.ok(registry.get('wait_for'), 'Expected wait_for to be registered') + assert.ok( + registry.get('browser_run_code'), + 'Expected browser_run_code to be registered', + ) + }) + + it('wait_for supports fixed delays without a browser call', async () => { + let waitForCalled = false + const start = Date.now() + const result = await executeTool( + wait_for, + { page: 1, time: 10 }, + createToolContext({ + waitFor: async () => { + waitForCalled = true + return false + }, + }), + AbortSignal.timeout(1_000), + ) + + assert.ok(!result.isError, textOf(result)) + assert.ok(Date.now() - start >= 8, 'Expected wait_for to delay') + assert.strictEqual(waitForCalled, false) + assert.deepStrictEqual(structuredOf(result), { + page: 1, + found: true, + target: '10ms', + timeout: 10, + }) + }) + + it('wait_for forwards disappearance conditions to the browser', async () => { + const calls: unknown[] = [] + const result = await executeTool( + wait_for, + { page: 7, textGone: 'Loading', selectorGone: '.spinner' }, + createToolContext({ + waitFor: async (_page: number, opts: unknown) => { + calls.push(opts) + return true + }, + }), + AbortSignal.timeout(1_000), + ) + + assert.ok(!result.isError, textOf(result)) + assert.deepStrictEqual(calls, [ + { + text: undefined, + textGone: 'Loading', + selector: undefined, + selectorGone: '.spinner', + timeout: 10_000, + }, + ]) + const data = structuredOf<{ + page: number + found: boolean + target: string + timeout: number + }>(result) + assert.strictEqual(data.page, 7) + assert.strictEqual(data.found, true) + assert.strictEqual(data.target, 'text "Loading" to disappear') + assert.strictEqual(data.timeout, 10_000) + }) + + it('browser_run_code returns successful values', async () => { + const result = await executeTool( + browser_run_code, + { page: 2, code: 'return args.value', args: { value: 42 } }, + createToolContext({ + runCode: async ( + page: number, + code: string, + args?: Record, + ) => ({ + value: { page, code, args }, + }), + }), + AbortSignal.timeout(1_000), + ) + + assert.ok(!result.isError, textOf(result)) + assert.deepStrictEqual(structuredOf(result).value, { + page: 2, + code: 'return args.value', + args: { value: 42 }, + }) + }) + + it('browser_run_code reports code errors', async () => { + const result = await executeTool( + browser_run_code, + { page: 2, code: 'throw new Error("boom")' }, + createToolContext({ + runCode: async () => ({ error: 'Error: boom' }), + }), + AbortSignal.timeout(1_000), + ) + + assert.ok(result.isError, 'Expected browser_run_code to fail') + assert.ok(textOf(result).includes('Error: boom')) + }) +})