Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 29 additions & 18 deletions packages/core/lib/v3/handlers/v3AgentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,11 @@ export class V3AgentHandler {
: allReasoning || "Task completed successfully";
}
}
await this.captureAndEmitScreenshot({
toolName: toolCall.toolName,
toolOutput: event.toolResults?.[i]?.output,
});

const mappedActions = mapToolResultToActions({
toolCallName: toolCall.toolName,
toolResult,
Expand All @@ -182,19 +187,6 @@ export class V3AgentHandler {
}
}
state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();

// Capture screenshot after tool execution (only for evals)
if (process.env.EVALS === "true") {
try {
await this.captureAndEmitScreenshot();
} catch (e) {
this.logger({
category: "agent",
message: `Warning: Failed to capture screenshot: ${getErrorMessage(e)}`,
level: 1,
});
}
}
}

if (userCallback) {
Expand Down Expand Up @@ -489,13 +481,32 @@ export class V3AgentHandler {
}

/**
* Capture a screenshot and emit it via the event bus
* Capture a screenshot and emit it via the event bus.
* Handles both hybrid mode (uses existing screenshot from tool output) and DOM mode (captures fresh screenshot).
* Only runs when EVALS=true.
*/
private async captureAndEmitScreenshot(): Promise<void> {
private async captureAndEmitScreenshot(options: {
toolName: string;
toolOutput?: { base64?: string };
}): Promise<void> {
if (process.env.EVALS !== "true") {
return;
}

try {
const page = await this.v3.context.awaitActivePage();
const screenshot = await page.screenshot({ fullPage: false });
this.v3.bus.emit("agent_screensot_taken_event", screenshot);
let screenshot: Buffer;

if (this.mode === "hybrid") {
if (options.toolName !== "screenshot" || !options.toolOutput?.base64) {
return;
}
screenshot = Buffer.from(options.toolOutput.base64, "base64");
} else {
const page = await this.v3.context.awaitActivePage();
screenshot = await page.screenshot({ fullPage: false });
}

this.v3.bus.emit("agent_screenshot_taken_event", screenshot);
} catch (error) {
this.logger({
category: "agent",
Expand Down
2 changes: 1 addition & 1 deletion packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@ export class V3CuaAgentHandler {
const page = await this.v3.context.awaitActivePage();
const base64Image = await page.screenshot({ fullPage: false });
// Emit screenshot event via the bus
this.v3.bus.emit("agent_screensot_taken_event", base64Image);
this.v3.bus.emit("agent_screenshot_taken_event", base64Image);
const currentUrl = page.url();
return await this.agentClient.captureScreenshot({
base64Image,
Expand Down
1 change: 0 additions & 1 deletion packages/evals/tasks/agent/alibaba_supplier_search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ export const alibaba_supplier_search: EvalFunction = async ({

// Start collecting screenshots throughout the agent's journey
const screenshotCollector = new ScreenshotCollector(v3, {
interval: 3000,
maxScreenshots: 15,
});
screenshotCollector.start();
Expand Down
38 changes: 27 additions & 11 deletions packages/evals/tasks/agent/all_recipes.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { V3Evaluator } from "@browserbasehq/stagehand";
import { EvalFunction } from "../../types/evals";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";

export const all_recipes: EvalFunction = async ({
debugUrl,
Expand All @@ -11,22 +12,37 @@ export const all_recipes: EvalFunction = async ({
try {
const page = v3.context.pages()[0];
await page.goto("https://www.allrecipes.com/");
const evaluator = new V3Evaluator(v3);

const screenshotCollector = new ScreenshotCollector(v3, {
maxScreenshots: 15,
});
screenshotCollector.start();

const instruction =
"Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.";
const agentResult = await agent.execute({
instruction:
"Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
instruction,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
});

const screenshots = await screenshotCollector.stop();

logger.log({
category: "evaluation",
message: `Collected ${screenshots.length} screenshots for evaluation`,
level: 1,
});

const evaluator = new V3Evaluator(v3);
const { evaluation, reasoning } = await evaluator.ask({
question: "Did the agent find a recipe for Beef Wellington",
question: `did the agent complete this task successfully? ${instruction}`,
screenshot: screenshots,
agentReasoning: agentResult.message,
});

logger.log(agentResult);
console.log(`reasoning: ${reasoning}`);

const success =
evaluation === "YES" &&
page.url() === "https://www.allrecipes.com/recipe/16899/beef-wellington/";
const success = evaluation === "YES";

if (!success) {
return {
Expand All @@ -37,17 +53,17 @@ export const all_recipes: EvalFunction = async ({
logs: logger.getLogs(),
};
}

return {
_success: true,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
return {
_success: false,
error,
message: errorMessage,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
1 change: 0 additions & 1 deletion packages/evals/tasks/agent/amazon_shoes_cart.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ export const amazon_shoes_cart: EvalFunction = async ({

// Start collecting screenshots throughout the agent's journey
const screenshotCollector = new ScreenshotCollector(v3, {
interval: 3000,
maxScreenshots: 15,
});
screenshotCollector.start();
Expand Down
36 changes: 27 additions & 9 deletions packages/evals/tasks/agent/apple_trade_in.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// This eval is expected to fail due to issues scrolling within the trade-in dialog.
import { EvalFunction } from "../../types/evals";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";

export const apple_trade_in: EvalFunction = async ({
debugUrl,
Expand All @@ -12,20 +13,36 @@ export const apple_trade_in: EvalFunction = async ({
try {
const page = v3.context.pages()[0];
await page.goto("https://www.apple.com/shop/trade-in");
const evaluator = new V3Evaluator(v3);
await agent.execute({
instruction:
"Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",

const screenshotCollector = new ScreenshotCollector(v3, {
maxScreenshots: 15,
});
screenshotCollector.start();

const instruction =
"Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.";
const agentResult = await agent.execute({
instruction,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
});

const screenshots = await screenshotCollector.stop();

logger.log({
category: "evaluation",
message: `Collected ${screenshots.length} screenshots for evaluation`,
level: 1,
});

const evaluator = new V3Evaluator(v3);
const { evaluation, reasoning } = await evaluator.ask({
question:
"Did the agent find the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website?",
screenshot: false,
answer: "360",
question: `did the agent complete this task successfully? ${instruction}`,
screenshot: screenshots,
agentReasoning: agentResult.message,
});

console.log(`reasoning: ${reasoning}`);

const success = evaluation === "YES";

if (!success) {
Expand All @@ -44,9 +61,10 @@ export const apple_trade_in: EvalFunction = async ({
logs: logger.getLogs(),
};
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
return {
_success: false,
message: error.message,
message: errorMessage,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
40 changes: 28 additions & 12 deletions packages/evals/tasks/agent/apple_tv.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { EvalFunction } from "../../types/evals";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";

export const apple_tv: EvalFunction = async ({
debugUrl,
Expand All @@ -12,27 +13,41 @@ export const apple_tv: EvalFunction = async ({
const page = v3.context.pages()[0];
await page.goto("https://www.apple.com/");

const screenshotCollector = new ScreenshotCollector(v3, {
maxScreenshots: 15,
});
screenshotCollector.start();

const instruction =
"Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.";
const agentResult = await agent.execute({
instruction:
"Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
instruction,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
});

const screenshots = await screenshotCollector.stop();

logger.log({
category: "evaluation",
message: `Collected ${screenshots.length} screenshots for evaluation`,
level: 1,
});

const evaluator = new V3Evaluator(v3);
const result = await evaluator.ask({
question:
"did the agent find the height and width of the Apple TV 4K in its reasoning which is 1.2 and 3.66?",
answer: agentResult.message,
const { evaluation, reasoning } = await evaluator.ask({
question: `did the agent complete this task successfully? ${instruction}`,
screenshot: screenshots,
agentReasoning: agentResult.message,
});

const url = page.url();
const success =
result.evaluation === "YES" &&
url.includes("https://www.apple.com/apple-tv-4k/specs/");
console.log(`reasoning: ${reasoning}`);

const success = evaluation === "YES";

if (!success) {
return {
_success: false,
message: agentResult.message,
message: reasoning,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand All @@ -45,9 +60,10 @@ export const apple_tv: EvalFunction = async ({
logs: logger.getLogs(),
};
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
return {
_success: false,
message: error.message,
message: errorMessage,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
33 changes: 23 additions & 10 deletions packages/evals/tasks/agent/arxiv_gpt_report.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// The agent often fails on this one.
import { EvalFunction } from "../../types/evals";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";

export const arxiv_gpt_report: EvalFunction = async ({
debugUrl,
Expand All @@ -11,22 +12,33 @@ export const arxiv_gpt_report: EvalFunction = async ({
}) => {
try {
const page = v3.context.pages()[0];
const evaluator = new V3Evaluator(v3);
await page.goto("https://arxiv.org/");

await agent.execute({
instruction:
"Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
const screenshotCollector = new ScreenshotCollector(v3, {
maxScreenshots: 15,
});
screenshotCollector.start();

const instruction =
"Find the paper 'GPT-4 Technical Report', when was v3 submitted?";
const agentResult = await agent.execute({
instruction,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
});

// Mon, 27 Mar 2023 17:46:54 UTC
const screenshots = await screenshotCollector.stop();

logger.log({
category: "evaluation",
message: `Collected ${screenshots.length} screenshots for evaluation`,
level: 1,
});

const evaluator = new V3Evaluator(v3);
const { evaluation, reasoning } = await evaluator.ask({
question:
"Did the agent find the published paper 'GPT-4 Technical Report' and the date it was submitted?",
screenshot: false,
answer: "03-27-2023",
question: `did the agent complete this task successfully? ${instruction}`,
screenshot: screenshots,
agentReasoning: agentResult.message,
});

console.log(`reasoning: ${reasoning}`);
Expand All @@ -49,9 +61,10 @@ export const arxiv_gpt_report: EvalFunction = async ({
logs: logger.getLogs(),
};
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
return {
_success: false,
message: error.message,
message: errorMessage,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
1 change: 0 additions & 1 deletion packages/evals/tasks/agent/columbia_tuition.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ export const columbia_tuition: EvalFunction = async ({

// Start collecting screenshots throughout the agent's journey
const screenshotCollector = new ScreenshotCollector(v3, {
interval: 3000,
maxScreenshots: 15,
});
screenshotCollector.start();
Expand Down
1 change: 0 additions & 1 deletion packages/evals/tasks/agent/flipkart_laptops.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ export const flipkart_laptops: EvalFunction = async ({

// Start collecting screenshots throughout the agent's journey
const screenshotCollector = new ScreenshotCollector(v3, {
interval: 3000,
maxScreenshots: 15,
});
screenshotCollector.start();
Expand Down
Loading