browserbase · tkattkat · Dec 18, 2025 · Dec 18, 2025 · Dec 18, 2025 · Dec 18, 2025
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -168,6 +168,11 @@ export class V3AgentHandler {
                 : allReasoning || "Task completed successfully";
             }
           }
+          await this.captureAndEmitScreenshot({
+            toolName: toolCall.toolName,
+            toolOutput: event.toolResults?.[i]?.output,
+          });
+
           const mappedActions = mapToolResultToActions({
             toolCallName: toolCall.toolName,
             toolResult,
@@ -182,19 +187,6 @@ export class V3AgentHandler {
           }
         }
         state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();
-
-        // Capture screenshot after tool execution (only for evals)
-        if (process.env.EVALS === "true") {
-          try {
-            await this.captureAndEmitScreenshot();
-          } catch (e) {
-            this.logger({
-              category: "agent",
-              message: `Warning: Failed to capture screenshot: ${getErrorMessage(e)}`,
-              level: 1,
-            });
-          }
-        }
       }
 
       if (userCallback) {
@@ -489,13 +481,32 @@ export class V3AgentHandler {
   }
 
   /**
-   * Capture a screenshot and emit it via the event bus
+   * Capture a screenshot and emit it via the event bus.
+   * In hybrid mode, re-emits the base64 image from the `screenshot` tool's output and emits nothing for other tools; in any other mode, captures a fresh viewport screenshot of the active page.
+   * Only runs when EVALS=true.
    */
-  private async captureAndEmitScreenshot(): Promise<void> {
+  private async captureAndEmitScreenshot(options: {
+    toolName: string;
+    toolOutput?: { base64?: string };
+  }): Promise<void> {
+    if (process.env.EVALS !== "true") {
+      return;
+    }
+
     try {
-      const page = await this.v3.context.awaitActivePage();
-      const screenshot = await page.screenshot({ fullPage: false });
-      this.v3.bus.emit("agent_screensot_taken_event", screenshot);
+      let screenshot: Buffer;
+
+      if (this.mode === "hybrid") {
+        if (options.toolName !== "screenshot" || !options.toolOutput?.base64) {
+          return;
+        }
+        screenshot = Buffer.from(options.toolOutput.base64, "base64");
+      } else {
+        const page = await this.v3.context.awaitActivePage();
+        screenshot = await page.screenshot({ fullPage: false });
+      }
+
+      this.v3.bus.emit("agent_screenshot_taken_event", screenshot);
     } catch (error) {
       this.logger({
         category: "agent",

diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
@@ -557,7 +557,7 @@ export class V3CuaAgentHandler {
       const page = await this.v3.context.awaitActivePage();
       const base64Image = await page.screenshot({ fullPage: false });
       // Emit screenshot event via the bus
-      this.v3.bus.emit("agent_screensot_taken_event", base64Image);
+      this.v3.bus.emit("agent_screenshot_taken_event", base64Image);
       const currentUrl = page.url();
       return await this.agentClient.captureScreenshot({
         base64Image,

diff --git a/packages/evals/tasks/agent/alibaba_supplier_search.ts b/packages/evals/tasks/agent/alibaba_supplier_search.ts
@@ -15,7 +15,6 @@ export const alibaba_supplier_search: EvalFunction = async ({
 
     // Start collecting screenshots throughout the agent's journey
     const screenshotCollector = new ScreenshotCollector(v3, {
-      interval: 3000,
       maxScreenshots: 15,
     });
     screenshotCollector.start();

diff --git a/packages/evals/tasks/agent/all_recipes.ts b/packages/evals/tasks/agent/all_recipes.ts
@@ -1,5 +1,6 @@
-import { V3Evaluator } from "@browserbasehq/stagehand";
 import { EvalFunction } from "../../types/evals";
+import { V3Evaluator } from "@browserbasehq/stagehand";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
 
 export const all_recipes: EvalFunction = async ({
   debugUrl,
@@ -11,22 +12,37 @@ export const all_recipes: EvalFunction = async ({
   try {
     const page = v3.context.pages()[0];
     await page.goto("https://www.allrecipes.com/");
-    const evaluator = new V3Evaluator(v3);
+
+    const screenshotCollector = new ScreenshotCollector(v3, {
+      maxScreenshots: 15,
+    });
+    screenshotCollector.start();
+
+    const instruction =
+      "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.";
     const agentResult = await agent.execute({
-      instruction:
-        "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
+      instruction,
       maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
     });
 
+    const screenshots = await screenshotCollector.stop();
+
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
+    });
+
+    const evaluator = new V3Evaluator(v3);
     const { evaluation, reasoning } = await evaluator.ask({
-      question: "Did the agent find a recipe for Beef Wellington",
+      question: `did the agent complete this task successfully? ${instruction}`,
+      screenshot: screenshots,
+      agentReasoning: agentResult.message,
     });
 
-    logger.log(agentResult);
+    console.log(`reasoning: ${reasoning}`);
 
-    const success =
-      evaluation === "YES" &&
-      page.url() === "https://www.allrecipes.com/recipe/16899/beef-wellington/";
+    const success = evaluation === "YES";
 
     if (!success) {
       return {
@@ -37,17 +53,17 @@ export const all_recipes: EvalFunction = async ({
         logs: logger.getLogs(),
       };
     }
-
     return {
       _success: true,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
   } catch (error) {
+    const errorMessage = error instanceof Error ? error.message : String(error);
     return {
       _success: false,
-      error,
+      message: errorMessage,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),

diff --git a/packages/evals/tasks/agent/amazon_shoes_cart.ts b/packages/evals/tasks/agent/amazon_shoes_cart.ts
@@ -15,7 +15,6 @@ export const amazon_shoes_cart: EvalFunction = async ({
 
     // Start collecting screenshots throughout the agent's journey
     const screenshotCollector = new ScreenshotCollector(v3, {
-      interval: 3000,
       maxScreenshots: 15,
     });
     screenshotCollector.start();

diff --git a/packages/evals/tasks/agent/apple_trade_in.ts b/packages/evals/tasks/agent/apple_trade_in.ts
@@ -1,6 +1,7 @@
 //this eval is expected to fail due to issues scrolling within the trade in dialog
 import { EvalFunction } from "../../types/evals";
 import { V3Evaluator } from "@browserbasehq/stagehand";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
 
 export const apple_trade_in: EvalFunction = async ({
   debugUrl,
@@ -12,20 +13,36 @@ export const apple_trade_in: EvalFunction = async ({
   try {
     const page = v3.context.pages()[0];
     await page.goto("https://www.apple.com/shop/trade-in");
-    const evaluator = new V3Evaluator(v3);
-    await agent.execute({
-      instruction:
-        "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
+
+    const screenshotCollector = new ScreenshotCollector(v3, {
+      maxScreenshots: 15,
+    });
+    screenshotCollector.start();
+
+    const instruction =
+      "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.";
+    const agentResult = await agent.execute({
+      instruction,
       maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
     });
 
+    const screenshots = await screenshotCollector.stop();
+
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
+    });
+
+    const evaluator = new V3Evaluator(v3);
     const { evaluation, reasoning } = await evaluator.ask({
-      question:
-        "Did the agent find the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website?",
-      screenshot: false,
-      answer: "360",
+      question: `did the agent complete this task successfully? ${instruction}`,
+      screenshot: screenshots,
+      agentReasoning: agentResult.message,
     });
 
+    console.log(`reasoning: ${reasoning}`);
+
     const success = evaluation === "YES";
 
     if (!success) {
@@ -44,9 +61,10 @@ export const apple_trade_in: EvalFunction = async ({
       logs: logger.getLogs(),
     };
   } catch (error) {
+    const errorMessage = error instanceof Error ? error.message : String(error);
     return {
       _success: false,
-      message: error.message,
+      message: errorMessage,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),

diff --git a/packages/evals/tasks/agent/apple_tv.ts b/packages/evals/tasks/agent/apple_tv.ts
@@ -1,5 +1,6 @@
 import { EvalFunction } from "../../types/evals";
 import { V3Evaluator } from "@browserbasehq/stagehand";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
 
 export const apple_tv: EvalFunction = async ({
   debugUrl,
@@ -12,27 +13,41 @@ export const apple_tv: EvalFunction = async ({
     const page = v3.context.pages()[0];
     await page.goto("https://www.apple.com/");
 
+    const screenshotCollector = new ScreenshotCollector(v3, {
+      maxScreenshots: 15,
+    });
+    screenshotCollector.start();
+
+    const instruction =
+      "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.";
     const agentResult = await agent.execute({
-      instruction:
-        "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
+      instruction,
       maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
     });
 
+    const screenshots = await screenshotCollector.stop();
+
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
+    });
+
     const evaluator = new V3Evaluator(v3);
-    const result = await evaluator.ask({
-      question:
-        "did the agent find the height and width of the Apple TV 4K in its reasoning which is 1.2 and 3.66?",
-      answer: agentResult.message,
+    const { evaluation, reasoning } = await evaluator.ask({
+      question: `did the agent complete this task successfully? ${instruction}`,
+      screenshot: screenshots,
+      agentReasoning: agentResult.message,
     });
 
-    const url = page.url();
-    const success =
-      result.evaluation === "YES" &&
-      url.includes("https://www.apple.com/apple-tv-4k/specs/");
+    console.log(`reasoning: ${reasoning}`);
+
+    const success = evaluation === "YES";
+
     if (!success) {
       return {
         _success: false,
-        message: agentResult.message,
+        message: reasoning,
         debugUrl,
         sessionUrl,
         logs: logger.getLogs(),
@@ -45,9 +60,10 @@ export const apple_tv: EvalFunction = async ({
       logs: logger.getLogs(),
     };
   } catch (error) {
+    const errorMessage = error instanceof Error ? error.message : String(error);
     return {
       _success: false,
-      message: error.message,
+      message: errorMessage,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),

diff --git a/packages/evals/tasks/agent/arxiv_gpt_report.ts b/packages/evals/tasks/agent/arxiv_gpt_report.ts
@@ -1,6 +1,7 @@
 //agent often fails on this one,
 import { EvalFunction } from "../../types/evals";
 import { V3Evaluator } from "@browserbasehq/stagehand";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
 
 export const arxiv_gpt_report: EvalFunction = async ({
   debugUrl,
@@ -11,22 +12,33 @@ export const arxiv_gpt_report: EvalFunction = async ({
 }) => {
   try {
     const page = v3.context.pages()[0];
-    const evaluator = new V3Evaluator(v3);
     await page.goto("https://arxiv.org/");
 
-    await agent.execute({
-      instruction:
-        "Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
+    const screenshotCollector = new ScreenshotCollector(v3, {
+      maxScreenshots: 15,
+    });
+    screenshotCollector.start();
+
+    const instruction =
+      "Find the paper 'GPT-4 Technical Report', when was v3 submitted?";
+    const agentResult = await agent.execute({
+      instruction,
       maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
     });
 
-    // Mon, 27 Mar 2023 17:46:54 UTC
+    const screenshots = await screenshotCollector.stop();
 
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
+    });
+
+    const evaluator = new V3Evaluator(v3);
     const { evaluation, reasoning } = await evaluator.ask({
-      question:
-        "Did the agent find the published paper 'GPT-4 Technical Report' and the date it was submitted?",
-      screenshot: false,
-      answer: "03-27-2023",
+      question: `did the agent complete this task successfully? ${instruction}`,
+      screenshot: screenshots,
+      agentReasoning: agentResult.message,
     });
 
     console.log(`reasoning: ${reasoning}`);
@@ -49,9 +61,10 @@ export const arxiv_gpt_report: EvalFunction = async ({
       logs: logger.getLogs(),
     };
   } catch (error) {
+    const errorMessage = error instanceof Error ? error.message : String(error);
     return {
       _success: false,
-      message: error.message,
+      message: errorMessage,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),

diff --git a/packages/evals/tasks/agent/columbia_tuition.ts b/packages/evals/tasks/agent/columbia_tuition.ts
@@ -15,7 +15,6 @@ export const columbia_tuition: EvalFunction = async ({
 
     // Start collecting screenshots throughout the agent's journey
     const screenshotCollector = new ScreenshotCollector(v3, {
-      interval: 3000,
       maxScreenshots: 15,
     });
     screenshotCollector.start();

diff --git a/packages/evals/tasks/agent/flipkart_laptops.ts b/packages/evals/tasks/agent/flipkart_laptops.ts
@@ -15,7 +15,6 @@ export const flipkart_laptops: EvalFunction = async ({
 
     // Start collecting screenshots throughout the agent's journey
     const screenshotCollector = new ScreenshotCollector(v3, {
-      interval: 3000,
       maxScreenshots: 15,
     });
     screenshotCollector.start();