diff --git a/lib/handlers/operatorHandler.ts b/lib/handlers/operatorHandler.ts index ec5a7d5e5..ecbead8ea 100644 --- a/lib/handlers/operatorHandler.ts +++ b/lib/handlers/operatorHandler.ts @@ -1,242 +1,1135 @@ -import { AgentAction, AgentExecuteOptions, AgentResult } from "@/types/agent"; +/* eslint-disable */ +import { AgentExecuteOptions, AgentResult } from "@/types/agent"; import { LogLine } from "@/types/log"; import { OperatorResponse, operatorResponseSchema, - OperatorSummary, operatorSummarySchema, } from "@/types/operator"; -import { LLMParsedResponse } from "../inference"; -import { ChatMessage, LLMClient } from "../llm/LLMClient"; -import { buildOperatorSystemPrompt } from "../prompt"; +import { LLMClient } from "../llm/LLMClient"; +import { PLANNER_PROMPT } from "../prompt"; import { StagehandPage } from "../StagehandPage"; -import { ObserveResult } from "@/types/stagehand"; -import { - StagehandError, - StagehandMissingArgumentError, -} from "@/types/stagehandErrors"; +import { StagehandError } from "@/types/stagehandErrors"; +import { CoreMessage, LanguageModelV1 } from "ai"; +import { LLMProvider } from "../llm/LLMProvider"; +import { getAISDKLanguageModel } from "../llm/LLMProvider"; +import { google } from "@ai-sdk/google"; +import { z } from "zod"; +import { WORKER_PROMPT } from "../prompt"; + +const PlannerLLM = google("gemini-2.5-flash-preview-04-17"); +const WorkerLLM = google("gemini-2.0-flash"); + +export type TaskStatus = "PENDING" | "IN_PROGRESS" | "DONE" | "FAILED"; + +// Define the subtask interface +export interface Subtask { + id: string; + description: string; + goal: string; + dependencies?: string[]; // IDs of subtasks that must be completed before this one + status: TaskStatus; +} + +// Define the plan interface +export interface TaskPlan { + summary: string; + subtasks: Subtask[]; +} + +export interface TaskProgress { + total: number; + completed: number; + failed: number; + inProgress: number; + pending: number; +} + +// Define the 
step interface (similar to the existing Step type) +export interface BrowserStep { + text: string; + reasoning: string; + tool: // | "GOTO" + | "ACT" + | "EXTRACT" + | "OBSERVE" + | "CLOSE" + | "WAIT" + | "NAVBACK" + | "SCREENSHOT" + | "DONE" + | "FAIL" + | "GET_URL"; + instruction: string; + stepNumber?: number; +} + +// Worker result interface +export interface WorkerResult { + status: "DONE" | "FAILED"; + steps: BrowserStep[]; + extraction?: any; + error?: string; + retryCount: number; +} export class StagehandOperatorHandler { private stagehandPage: StagehandPage; - private logger: (message: LogLine) => void; + private readonly logger: (logLine: LogLine) => void; private llmClient: LLMClient; - private messages: ChatMessage[]; - + private llmProvider: LLMProvider; + private messages: CoreMessage[]; + private model: LanguageModelV1 | LLMClient; + private modelName: string; constructor( stagehandPage: StagehandPage, logger: (message: LogLine) => void, llmClient: LLMClient, + llmProvider: LLMProvider, + modelName: string, ) { this.stagehandPage = stagehandPage; this.logger = logger; this.llmClient = llmClient; - } - - public async execute( - instructionOrOptions: string | AgentExecuteOptions, - ): Promise { - const options = - typeof instructionOrOptions === "string" - ? 
{ instruction: instructionOrOptions } - : instructionOrOptions; - - this.messages = [buildOperatorSystemPrompt(options.instruction)]; - let completed = false; - let currentStep = 0; - const maxSteps = options.maxSteps || 10; - const actions: AgentAction[] = []; + this.llmProvider = llmProvider; + this.modelName = modelName; + const firstSlashIndex = this.modelName.indexOf("/"); + const subProvider = this.modelName.substring(0, firstSlashIndex); + const subModelName = this.modelName.substring(firstSlashIndex + 1); - while (!completed && currentStep < maxSteps) { - const url = this.stagehandPage.page.url(); + const languageModel = getAISDKLanguageModel( + subProvider, + subModelName, + this.llmClient.clientOptions?.apiKey, + ); + this.model = languageModel; + } - if (!url || url === "about:blank") { - this.messages.push({ + public async plan(goal: string): Promise { + // Generate a plan using the LLM + // TODO add animation ... + this.logger({ + category: "operator", + message: `Generating plan`, + level: 1, + }); + const planResult = await this.llmClient.generateObject({ + model: PlannerLLM, + schema: z.object({ + summary: z.string().describe("A summary of the overall task plan"), + subtasks: z + .array( + z.object({ + description: z + .string() + .describe( + "A clear description of what this subtask should accomplish", + ), + goal: z + .string() + .describe("The specific goal this subtask aims to achieve"), + dependencies: z + .array(z.number()) + .optional() + .describe( + "Array of subtask indices (0-based) that must be completed before this subtask can begin", + ), + }), + ) + .min(1) + .describe("An array of subtasks to accomplish the overall goal"), + }), + messages: [ + { + role: "system", + content: PLANNER_PROMPT, + }, + { role: "user", content: [ { type: "text", - text: "No page is currently loaded. The first step should be a 'goto' action to navigate to a URL.", + text: `I need a plan for accomplishing this task: "${goal}. 
You're currently on this page: ${this.stagehandPage.page.url()}"`, }, ], + }, + ], + }); + const subtasks = planResult.object.subtasks.map((subtask, index) => ({ + id: `subtask-${index + 1}`, + description: subtask.description, + goal: subtask.goal, + dependencies: subtask.dependencies?.map( + (depIndex) => `subtask-${depIndex + 1}`, + ), + status: "PENDING" as const, + })); + + const plan = { + summary: planResult.object.summary, + subtasks, + }; + + return plan; + } + + public async executeSubtask( + subtask: Subtask, + overallGoal: string, + taskPlanContext?: any, + ): Promise { + this.logger({ + category: "operator", + message: `Executing subtask ${subtask.id}: ${subtask.goal}`, + level: 1, + }); + + const MAX_STEPS = 15; + const MAX_RETRIES = 3; + const MAX_HISTORY = 5; + + let steps: BrowserStep[] = []; + let extraction: any = null; + let retryCount = 0; + let lastError: Error | null = null; + let currentScreenshot: string | null = null; + let currentPageText: string | null = null; + const recentActionHistory: Array<{ tool: string; instruction: string }> = + []; + let isSubtaskComplete = false; + let currentUrl = "unknown"; + let previousExtraction: any = null; // Potentially pass this in if needed from dependencies + + try { + // Get initial state + currentUrl = this.stagehandPage.page.url() || "unknown"; + this.logger({ + category: "operator", + message: `Starting URL: ${currentUrl}`, + level: 2, + }); + + try { + this.logger({ + category: "operator", + message: `Capturing initial screenshot for subtask ${subtask.id}`, + level: 2, }); - } else { - const screenshot = await this.stagehandPage.page.screenshot({ - type: "png", - fullPage: false, + currentScreenshot = await this._performBrowserAction({ + text: "Capturing initial screenshot", + reasoning: "Need visual context of starting state", + tool: "SCREENSHOT", + instruction: "", + }); + this.logger({ + category: "operator", + message: `Initial screenshot captured`, + level: 2, }); + try { + const 
textResult = await this.stagehandPage.page.extract(); + currentPageText = textResult.page_text; + this.logger({ + category: "operator", + message: `Extracted initial page text content`, + level: 2, + }); + } catch (textExtractError) { + this.logger({ + category: "operator", + message: `Failed to extract initial page text: ${textExtractError}`, + level: 0, + }); + currentPageText = null; + } + } catch (e) { + this.logger({ + category: "operator", + message: `Failed to capture initial screenshot: ${e}`, + level: 0, + }); + lastError = e instanceof Error ? e : new Error(String(e)); + } + + // Main execution loop + while ( + !isSubtaskComplete && + steps.length < MAX_STEPS && + retryCount < MAX_RETRIES + ) { + try { + const nextStep = await this._generateNextStepInstruction({ + subtaskId: subtask.id, + overallGoal, + subtaskGoal: subtask.goal, + subtaskDescription: subtask.description, + taskPlanContext, // Pass the context + previousSteps: steps, + currentUrl, + previousExtraction, + screenshot: currentScreenshot, + currentPageText: currentPageText, // Pass textual content + }); + + this.logger({ + category: "operator", + message: `[Subtask ${subtask.id}] Step ${steps.length + 1}: ${nextStep.tool} - ${nextStep.instruction.substring(0, 100)}${nextStep.instruction.length > 100 ? "..." : ""}`, + level: 2, + }); - const base64Image = screenshot.toString("base64"); + // 2. 
Check for explicit DONE/FAIL + if (nextStep.tool === "DONE") { + this.logger({ + category: "operator", + message: `Subtask ${subtask.id} marked DONE by agent: ${nextStep.instruction}`, + level: 1, + }); + steps.push(nextStep); + return { + status: "DONE", + steps, + extraction, + retryCount, + }; + } + if (nextStep.tool === "FAIL") { + this.logger({ + category: "operator", + message: `Subtask ${subtask.id} marked FAIL by agent: ${nextStep.instruction}`, + level: 0, + }); + steps.push(nextStep); + return { + status: "FAILED", + steps, + error: nextStep.instruction, + retryCount, + }; + } - let messageText = `Here is a screenshot of the current page (URL: ${url}):`; + // 3. Check for loops + if (this._isRepeatingAction(nextStep, recentActionHistory)) { + this.logger({ + category: "operator", + message: `[Subtask ${subtask.id}] Detected potential loop on action: ${nextStep.tool}. Incrementing retry count.`, + level: 0, + }); + retryCount++; + if (retryCount >= MAX_RETRIES) { + throw new StagehandError( + `Failed due to repeating action (${nextStep.tool}) ${retryCount} times.`, + ); + } + // TODO: Consider adding wait or alternative strategy before continuing + this.logger({ + category: "operator", + message: `Retrying (${retryCount}/${MAX_RETRIES})...`, + level: 2, + }); + // Capture fresh screenshot to potentially break loop + try { + currentScreenshot = await this._performBrowserAction({ + tool: "SCREENSHOT", + instruction: "", + text: "Refreshing screenshot for loop retry", + reasoning: "Get updated visual context", + }); + } catch (screenshotError) { + this.logger({ + category: "operator", + message: `Failed to capture fresh screenshot during loop retry: ${screenshotError}`, + level: 0, + }); + } + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait before retry + // ---> Also extract text content after loop retry screenshot + try { + const textResult = await this.stagehandPage.page.extract(); + currentPageText = textResult.page_text; + 
this.logger({ + category: "operator", + message: `Extracted page text content after loop retry screenshot`, + level: 2, + }); + } catch (textExtractError) { + this.logger({ + category: "operator", + message: `Failed to extract page text after loop retry screenshot: ${textExtractError}`, + level: 0, + }); + currentPageText = null; + } + continue; + } - messageText = `Previous actions were: ${actions - .map((action) => { - let result: string = ""; - if (action.type === "act") { - const args = action.playwrightArguments as ObserveResult; - result = `Performed a "${args.method}" action ${args.arguments.length > 0 ? `with arguments: ${args.arguments.map((arg) => `"${arg}"`).join(", ")}` : ""} on "${args.description}"`; - } else if (action.type === "extract") { - result = `Extracted data: ${action.extractionResult}`; + // Add step to history *before* execution (to track attempts) + steps.push(nextStep); + + // 4. Execute the browser step + const result = await this._performBrowserAction(nextStep); + lastError = null; + + // 5. 
Handle results/updates + if (nextStep.tool === "EXTRACT") { + extraction = result; + previousExtraction = result; + this.logger({ + category: "operator", + message: `Extraction result: ${JSON.stringify(extraction)}`, + level: 2, + }); + } + // if (nextStep.tool === "GOTO" || nextStep.tool === "NAVBACK") { + if (nextStep.tool === "NAVBACK") { + currentUrl = this.stagehandPage.page.url() || "unknown"; // Update URL after navigation + this.logger({ + category: "operator", + message: `URL updated to: ${currentUrl}`, + level: 2, + }); + } + // Always get a new screenshot unless the action was a screenshot + if (nextStep.tool !== "SCREENSHOT") { + try { + currentScreenshot = await this._performBrowserAction({ + tool: "SCREENSHOT", + instruction: "", + text: "Capturing screenshot after action", + reasoning: "Get updated visual context", + }); + this.logger({ + category: "operator", + message: `Captured screenshot after ${nextStep.tool}`, + level: 2, + }); + try { + const textResult = await this.stagehandPage.page.extract(); + currentPageText = textResult.page_text; + this.logger({ + category: "operator", + message: `Extracted page text content after ${nextStep.tool}`, + level: 2, + }); + } catch (textExtractError) { + this.logger({ + category: "operator", + message: `Failed to extract page text after ${nextStep.tool}: ${textExtractError}`, + level: 0, + }); + currentPageText = null; + } + } catch (screenshotError) { + this.logger({ + category: "operator", + message: `Failed to capture screenshot after ${nextStep.tool}: ${screenshotError}`, + level: 0, + }); + } + } else { + currentScreenshot = result as string; // Use the result of the screenshot action + this.logger({ + category: "operator", + message: `Updated screenshot from SCREENSHOT action`, + level: 2, + }); + try { + const textResult = await this.stagehandPage.page.extract(); + currentPageText = textResult.page_text; + this.logger({ + category: "operator", + message: `Extracted page text content after SCREENSHOT 
action`, + level: 2, + }); + } catch (textExtractError) { + this.logger({ + category: "operator", + message: `Failed to extract page text after SCREENSHOT action: ${textExtractError}`, + level: 0, + }); + currentPageText = null; } - return `[${action.type}] ${action.reasoning}. Result: ${result}`; - }) - .join("\n")}\n\n${messageText}`; + } - this.messages.push({ - role: "user", - content: [ - { - type: "text", - text: messageText, - }, - this.llmClient.type === "anthropic" - ? { - type: "image", - source: { - type: "base64", - media_type: "image/png", - data: base64Image, - }, - text: "the screenshot of the current page", - } - : { - type: "image_url", - image_url: { url: `data:image/png;base64,${base64Image}` }, - }, - ], - }); - } + // TODO: Add logic to determine if subtask is implicitly complete based on state/result? + // isSubtaskComplete = ... + } catch (error) { + lastError = error instanceof Error ? error : new Error(String(error)); + this.logger({ + category: "operator", + message: `[Subtask ${subtask.id}] Error executing step ${steps.length}: ${lastError.message}`, + level: 0, + }); + retryCount++; - const result = await this.getNextStep(currentStep); + if (retryCount >= MAX_RETRIES) { + this.logger({ + category: "operator", + message: `[Subtask ${subtask.id}] Failed after ${retryCount} retries.`, + level: 0, + }); + // Add final FAIL step + steps.push({ + text: "Marking subtask failed", + reasoning: "Exceeded max retries", + tool: "FAIL", + instruction: lastError.message, + }); + return { + status: "FAILED", + steps, + error: lastError.message, + retryCount, + }; + } - if (result.method === "close") { - completed = true; + this.logger({ + category: "operator", + message: `Retrying (${retryCount}/${MAX_RETRIES})...`, + level: 2, + }); + // Capture screenshot before retry + try { + currentScreenshot = await this._performBrowserAction({ + tool: "SCREENSHOT", + instruction: "", + text: "Capturing screenshot for error retry", + reasoning: "Get updated 
visual context", + }); + } catch (screenshotError) { + this.logger({ + category: "operator", + message: `Failed to capture screenshot during error retry: ${screenshotError}`, + level: 0, + }); + } + await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait before retry + try { + const textResult = await this.stagehandPage.page.extract(); + currentPageText = textResult.page_text; + this.logger({ + category: "operator", + message: `Extracted page text content after error retry screenshot`, + level: 2, + }); + } catch (textExtractError) { + this.logger({ + category: "operator", + message: `Failed to extract page text after error retry screenshot: ${textExtractError}`, + level: 0, + }); + currentPageText = null; + } + } } - let playwrightArguments: ObserveResult | undefined; - if (result.method === "act") { - [playwrightArguments] = await this.stagehandPage.page.observe( - result.parameters, - ); + // Loop finished - Determine final status + if (isSubtaskComplete) { + // It should have returned earlier via DONE tool, but as a fallback + this.logger({ + category: "operator", + message: `[Subtask ${subtask.id}] Completed successfully (end of loop).`, + level: 1, + }); + if (steps[steps.length - 1]?.tool !== "DONE") { + steps.push({ + text: "Marking subtask complete", + reasoning: "Reached end of execution loop successfully", + tool: "DONE", + instruction: "Subtask completed", + }); + } + return { status: "DONE", steps, extraction, retryCount }; + } else if (steps.length >= MAX_STEPS) { + this.logger({ + category: "operator", + message: `[Subtask ${subtask.id}] Failed due to exceeding max steps (${MAX_STEPS}).`, + level: 0, + }); + if (steps[steps.length - 1]?.tool !== "FAIL") { + steps.push({ + text: "Marking subtask failed", + reasoning: "Exceeded max steps", + tool: "FAIL", + instruction: `Reached step limit (${MAX_STEPS})`, + }); + } + return { + status: "FAILED", + steps, + error: `Exceeded maximum steps (${MAX_STEPS})`, + retryCount, + }; + } else { + // 
Loop ended due to retries, should have been handled in catch block + this.logger({ + category: "operator", + message: `[Subtask ${subtask.id}] Loop ended unexpectedly. Assuming failure.`, + level: 0, + }); + if (steps[steps.length - 1]?.tool !== "FAIL") { + steps.push({ + text: "Marking subtask failed", + reasoning: "Execution loop ended unexpectedly after retries", + tool: "FAIL", + instruction: lastError?.message ?? "Unknown error after retries", + }); + } + return { + status: "FAILED", + steps, + error: lastError?.message ?? "Unknown error after retries", + retryCount, + }; } - let extractionResult: unknown | undefined; - if (result.method === "extract") { - extractionResult = await this.stagehandPage.page.extract( - result.parameters, - ); + } catch (fatalError) { + // Catch errors during initial setup or other unexpected fatal issues + lastError = + fatalError instanceof Error + ? fatalError + : new Error(String(fatalError)); + this.logger({ + category: "operator", + message: `[Subtask ${subtask.id}] Fatal error during execution: ${lastError.message}`, + level: 0, + }); + if (steps[steps.length - 1]?.tool !== "FAIL") { + steps.push({ + text: "Marking subtask failed", + reasoning: "Fatal error during execution", + tool: "FAIL", + instruction: lastError.message, + }); } + return { + status: "FAILED", + steps, + error: lastError.message, + retryCount, + }; + } + } + + public async execute( + instructionOrOptions: string | AgentExecuteOptions, + ): Promise { + const options = + typeof instructionOrOptions === "string" + ? { instruction: instructionOrOptions } + : instructionOrOptions; - await this.executeAction(result, playwrightArguments, extractionResult); + this.logger({ + category: "operator", + message: `Starting task execution for: '${options.instruction}'`, + level: 1, + }); + + // 1. 
Generate the plan + let plan: TaskPlan; + try { + plan = await this.plan(options.instruction); + this.logger({ + category: "operator", + message: `Generated plan: ${JSON.stringify(plan, null, 2)}`, + level: 2, // Debug level for full plan + }); + } catch (error) { + this.logger({ + category: "operator", + message: `Failed to generate plan: ${error}`, + level: 0, + }); + return { + success: false, + message: `Failed to generate plan: ${error instanceof Error ? error.message : String(error)}`, + actions: [], + completed: false, + }; + } + + // 2. Execute subtasks sequentially (basic implementation) + let overallSuccess = true; + let finalMessage = `Task execution initiated for: ${options.instruction}`; + const executedSubtaskResults: { + subtaskId: string; + result: WorkerResult; + }[] = []; + // Simple state for passing extractions (can be made more robust) + let lastExtractionResult: any = null; + + for (const subtask of plan.subtasks) { + // TODO: Implement dependency checking here if needed + // For now, execute sequentially + if (subtask.status !== "PENDING") { + this.logger({ + category: "operator", + message: `Skipping subtask ${subtask.id} with status ${subtask.status}`, + level: 1, + }); + continue; + } - actions.push({ - type: result.method, - reasoning: result.reasoning, - taskCompleted: result.taskComplete, - parameters: result.parameters, - playwrightArguments, - extractionResult, + this.logger({ + category: "operator", + message: `Executing subtask ${subtask.id}: ${subtask.goal}`, + level: 1, }); + subtask.status = "IN_PROGRESS"; - currentStep++; + // Prepare context (can be expanded) + const taskPlanContext = { + planDescription: plan.summary, + // TODO: Add position, total, other subtasks if needed by prompt + }; + + try { + // Pass previous extraction result if available + const subtaskResult = await this.executeSubtask( + subtask, + options.instruction, + taskPlanContext /*, lastExtractionResult */, + ); // Pass extraction if implementing state 
transfer + executedSubtaskResults.push({ + subtaskId: subtask.id, + result: subtaskResult, + }); + + if (subtaskResult.status === "DONE") { + subtask.status = "DONE"; + this.logger({ + category: "operator", + message: `Subtask ${subtask.id} completed successfully.`, + level: 1, + }); + // Update last extraction result if present + if (subtaskResult.extraction) { + lastExtractionResult = subtaskResult.extraction; + } + } else { + // status === "FAILED" + subtask.status = "FAILED"; + overallSuccess = false; + finalMessage = `Task failed during subtask ${subtask.id}: ${subtaskResult.error || "Unknown error"}`; + this.logger({ + category: "operator", + message: `Subtask ${subtask.id} failed: ${subtaskResult.error}`, + level: 0, + }); + // Optional: Stop execution on first failure + this.logger({ + category: "operator", + message: `Stopping task execution due to subtask failure.`, + level: 0, + }); + break; + } + } catch (error) { + subtask.status = "FAILED"; + overallSuccess = false; + finalMessage = `Task failed during subtask ${subtask.id} execution: ${error instanceof Error ? error.message : String(error)}`; + this.logger({ + category: "operator", + message: `Fatal error during subtask ${subtask.id} execution: ${error}`, + level: 0, + }); + // Stop execution on fatal error + break; + } + } + + // 3. 
Determine final result + if (overallSuccess) { + const allDone = plan.subtasks.every((st) => st.status === "DONE"); + if (allDone) { + finalMessage = `Task completed successfully: ${plan.summary}`; + this.logger({ + category: "operator", + message: `All subtasks completed successfully.`, + level: 1, + }); + } else { + finalMessage = `Task finished, but some subtasks may not have run or completed.`; + this.logger({ + category: "operator", + message: `Task finished, but not all subtasks reached DONE status.`, + level: 1, + }); + overallSuccess = false; // Mark as not fully successful if not all are DONE + } } + // Adapt the AgentResult structure - actions might represent subtask results now + // For now, return a simplified actions array or potentially the detailed results. + // Let's return the summary message for now. return { - success: true, - message: await this.getSummary(options.instruction), - actions, - completed: actions[actions.length - 1].taskCompleted as boolean, + success: overallSuccess, + message: finalMessage, + // actions: executedSubtaskResults, // Or adapt AgentAction type + actions: [], // Placeholder - requires defining how subtask results map to AgentAction + completed: overallSuccess, // Assuming overallSuccess implies completion for now + // Potentially add final extraction result here + // extraction: lastExtractionResult }; } - private async getNextStep(currentStep: number): Promise { - const { data: response } = - (await this.llmClient.createChatCompletion({ - options: { - messages: this.messages, - response_model: { - name: "operatorResponseSchema", - schema: operatorResponseSchema, - }, - requestId: `operator-step-${currentStep}`, - }, - logger: this.logger, - })) as LLMParsedResponse; + private async _performBrowserAction(step: BrowserStep): Promise { + this.logger({ + category: "operator", + message: `[Subtask ${step.stepNumber}] Executing: ${step.tool} - ${step.instruction.substring(0, 100)}`, + level: 1, + }); // Added logging + 
const page = this.stagehandPage.page; - return response; - } + switch (step.tool) { + // case "GOTO": + // await page.goto(step.instruction, { + // waitUntil: "commit", // Match original code + // timeout: 60000, // Match original code + // }); + // // Ensure DOM settles after navigation + // await this.stagehandPage._waitForSettledDom(); + // return null; // No specific result for GOTO - private async getSummary(goal: string): Promise { - const { data: response } = - (await this.llmClient.createChatCompletion({ - options: { - messages: [ - ...this.messages, - { - role: "user", - content: [ - { - type: "text", - text: `Now use the steps taken to answer the original instruction of ${goal}.`, - }, - ], - }, - ], - response_model: { - name: "operatorSummarySchema", - schema: operatorSummarySchema, - }, - requestId: "operator-summary", - }, - logger: this.logger, - })) as LLMParsedResponse; + case "ACT": + // Pass instruction string directly, remove unsupported slowDomBasedAct + const actResult = await page.act(step.instruction); + await this.stagehandPage._waitForSettledDom(); + return actResult; // Return the result of the ACT action - return response.answer; - } - private async executeAction( - action: OperatorResponse, - playwrightArguments?: ObserveResult, - extractionResult?: unknown, - ): Promise { - const { method, parameters } = action; - const page = this.stagehandPage.page; + case "EXTRACT": { + this.logger({ + category: "operator", + message: `Extracting: ${step.instruction}`, + level: 2, + }); + // Assuming extract returns an object with an extraction property based on original code + const result = await page.extract(step.instruction); + // Check if result has extraction property, otherwise return the whole result + return result && typeof result === "object" && "extraction" in result + ? 
result.extraction + : result; + } - if (method === "close") { - return; - } + case "OBSERVE": + this.logger({ + category: "operator", + message: `Observing: ${step.instruction || "(no instruction)"}`, + level: 2, + }); + return await page.observe({ instruction: step.instruction }); - switch (method) { - case "act": - if (!playwrightArguments) { - throw new StagehandMissingArgumentError( - "No arguments provided to `act()`. " + - "Please ensure that all required arguments are passed in.", - ); - } - await page.act(playwrightArguments); - break; - case "extract": - if (!extractionResult) { - throw new StagehandError( - "Error in OperatorHandler: Cannot complete extraction. No extractionResult provided.", - ); + case "SCREENSHOT": { + this.logger({ + category: "operator", + message: `Taking screenshot`, + level: 2, + }); + const cdpSession = await page.context().newCDPSession(page); + try { + const { data } = await cdpSession.send("Page.captureScreenshot", { + format: "png", + }); // Specify format like png + return `data:image/png;base64,${data}`; // Return base64 data URL + } finally { + await cdpSession.detach(); // Ensure session is detached } - return extractionResult; - case "goto": - await page.goto(parameters, { waitUntil: "load" }); - break; - case "wait": - await page.waitForTimeout(parseInt(parameters)); - break; - case "navback": + } + + case "WAIT": + this.logger({ + category: "operator", + message: `Waiting for ${step.instruction}ms`, + level: 2, + }); + await page.waitForTimeout(Number(step.instruction)); + return null; + + case "NAVBACK": + this.logger({ + category: "operator", + message: `Navigating back`, + level: 2, + }); await page.goBack(); - break; - case "refresh": - await page.reload(); - break; + await this.stagehandPage._waitForSettledDom(); + return null; + + case "GET_URL": + this.logger({ + category: "operator", + message: `Getting current URL`, + level: 2, + }); + return await page.url(); + + case "DONE": + case "FAIL": + // These are 
terminal states, handled by the main loop based on the step received + // No browser action needed here, but we return the status info + this.logger({ + category: "operator", + message: `Step is terminal: ${step.tool}`, + level: 1, + }); + return { status: step.tool, message: step.instruction }; + + // CLOSE is deprecated and converted to DONE by _generateNextStepInstruction + // case "CLOSE": + default: + // Use StagehandError for consistency throw new StagehandError( - `Error in OperatorHandler: Cannot execute unknown action: ${method}`, + `[OperatorHandler] _performBrowserAction: Unimplemented or unknown tool ${step.tool}`, ); } } + + private _isRepeatingAction( + step: BrowserStep, + history: Array<{ tool: string; instruction: string }>, + ): boolean { + // TODO: Implement logic from the provided code + console.log("[OperatorHandler] TODO: Implement _isRepeatingAction logic"); + const MAX_HISTORY = 5; // Number of actions to track + const MAX_DUPLICATES = 2; // Maximum number of times the same action can be repeated + + const duplicate = history.filter( + (h) => h.tool === step.tool && h.instruction === step.instruction, + ).length; + + // Add current action to history (mutable operation, careful if history is shared) + history.push({ + tool: step.tool, + instruction: step.instruction, + }); + + // Keep history at MAX_HISTORY size + if (history.length > MAX_HISTORY) { + history.shift(); + } + + return duplicate >= MAX_DUPLICATES; + } + + private _hasPossibleLoop(steps: BrowserStep[]): boolean { + // TODO: Implement logic from the provided code + console.log("[OperatorHandler] TODO: Implement _hasPossibleLoop logic"); + if (steps.length < 3) return false; + + const recentSteps = steps.slice(-3); + const allSameTool = recentSteps.every( + (s) => s.tool === recentSteps[0].tool, + ); + const uniqueInstructions = new Set(recentSteps.map((s) => s.instruction)); + const hasRepeatedInstructions = + uniqueInstructions.size < recentSteps.length; + const stuckPhrases = [ 
+ "still", + "again", + "retry", + "same", + "another attempt", + "try once more", + ]; + const containsStuckPhrases = recentSteps.some((s) => + stuckPhrases.some( + (phrase) => + s.text.toLowerCase().includes(phrase) || + s.reasoning.toLowerCase().includes(phrase), + ), + ); + + return (allSameTool && hasRepeatedInstructions) || containsStuckPhrases; + } +
+  /**
+   * Asks the worker model for the single next browser step of a subtask.
+   *
+   * Builds a text prompt from the overall goal, the subtask goal and
+   * description, the optional plan context, the steps taken so far, the
+   * previous extraction, the current URL and the page text. A warning is
+   * appended when `_hasPossibleLoop` flags the step history. The prompt is
+   * sent through `llmClient.generateObject` with `WORKER_PROMPT` as the
+   * system message; the screenshot, when present, is attached as an image
+   * part of the user message.
+   *
+   * The step returned by the model is normalised before it is handed back:
+   * - the deprecated `CLOSE` tool is rewritten to `DONE`;
+   * - a step whose text or reasoning contains completion language is
+   *   rewritten to `DONE`, unless it is already `DONE` or `FAIL`.
+   *
+   * Prompt construction happens before the `try`, so an exception raised
+   * there (for example by `JSON.stringify`) is not converted into the
+   * fallback step.
+   *
+   * @param params.subtaskId - Used only to tag log messages.
+   * @param params.overallGoal - Goal of the whole task; always in the prompt.
+   * @param params.subtaskGoal - Goal of this subtask; always in the prompt.
+   * @param params.subtaskDescription - Description of this subtask; always in the prompt.
+   * @param params.taskPlanContext - Optional plan data; `planDescription`,
+   *   `subtaskPosition` and `totalSubtasks` are read when present.
+   * @param params.previousSteps - Steps already taken, listed in the prompt
+   *   and checked for loops.
+   * @param params.currentUrl - URL written into the prompt.
+   * @param params.previousExtraction - Serialised with `JSON.stringify` when truthy.
+   * @param params.screenshot - Image payload for the model; omitted when falsy.
+   * @param params.currentPageText - Page text included in full when truthy.
+   * @returns The next step. If anything inside the `try` throws, a
+   *   `SCREENSHOT` step whose reasoning carries the error message.
+   */
+  private async _generateNextStepInstruction(params: {
+    subtaskId: string;
+    overallGoal: string;
+    subtaskGoal: string;
+    subtaskDescription: string;
+    taskPlanContext: any; // Define specific type later
+    previousSteps: BrowserStep[];
+    currentUrl: string;
+    previousExtraction: any;
+    screenshot: string | null;
+    currentPageText: string | null;
+  }): Promise<BrowserStep> {
+    const {
+      subtaskId,
+      overallGoal,
+      subtaskGoal,
+      subtaskDescription,
+      taskPlanContext, // Assuming this might contain planDescription, subtaskPosition, totalSubtasks, otherSubtasks
+      previousSteps,
+      currentUrl,
+      previousExtraction,
+      screenshot,
+      currentPageText,
+    } = params;
+
+    this.logger({
+      category: "operator",
+      message: `[Subtask ${subtaskId}] Generating next step instruction.`,
+      level: 2,
+    });
+
+    // Define the schema for the LLM response
+    const browserStepSchema = z.object({
+      text: z
+        .string()
+        .describe("A concise description of what action to take next"),
+      reasoning: z
+        .string()
+        .describe(
+          "Your reasoning for choosing this action, referring specifically to what you observe in the screenshot and how it relates to the overall task",
+        ),
+      tool: z
+        .enum([
+          // "GOTO",
+          "ACT",
+          "EXTRACT",
+          "OBSERVE",
+          "CLOSE",
+          "WAIT",
+          "NAVBACK",
+          "SCREENSHOT",
+          "GET_URL",
+          "DONE",
+          "FAIL",
+        ])
+        .describe(
+          "The tool to use for this step (CLOSE is deprecated, use DONE)",
+        ),
+      instruction: z
+        .string()
+        .describe("The specific instruction for the selected tool"),
+    });
+
+    // Construct the text prompt dynamically
+    let textPrompt = `
+OVERALL TASK GOAL: ${overallGoal}
+`;
+    // Add Task Plan context if available (adapt based on actual structure of taskPlanContext)
+    if (taskPlanContext && taskPlanContext.planDescription) {
+      textPrompt += `PLAN DESCRIPTION: ${taskPlanContext.planDescription}\n`;
+    }
+    // The subtask itself is always stated: the closing instruction asks the
+    // model to "achieve the subtask goal", so it must be present even when no
+    // plan context was supplied.
+    textPrompt += `YOUR SUBTASK GOAL: ${subtaskGoal}\n`;
+    textPrompt += `SUBTASK DESCRIPTION: ${subtaskDescription}\n`;
+    if (taskPlanContext) {
+      if (taskPlanContext.subtaskPosition) {
+        textPrompt += `YOUR SUBTASK POSITION: ${taskPlanContext.subtaskPosition} of ${taskPlanContext.totalSubtasks || "?"}\n`;
+      }
+      // TODO: Add info about other subtasks if needed/available in taskPlanContext
+      textPrompt += `\nHOW THIS SUBTASK FITS INTO THE OVERALL PLAN:
+This subtask is one part of achieving the overall goal. Your work will contribute to the larger task.
+`;
+    }
+
+    if (previousSteps.length > 0) {
+      textPrompt += `\nPREVIOUS STEPS YOU\'VE TAKEN:
+${previousSteps.map((step, i) => `Step ${i + 1}: ${step.text}\nTool: ${step.tool}\nInstruction: ${step.instruction}\nReasoning: ${step.reasoning}`).join("\n\n")}\n`;
+    }
+    if (previousExtraction) {
+      textPrompt += `\nPREVIOUS EXTRACTION:
+${JSON.stringify(previousExtraction, null, 2)}\n`;
+    }
+    textPrompt += `\nCURRENT URL: ${currentUrl}\n`;
+
+    // Add textual page content if available.
+    // NOTE: the text is inserted in full; no length limit is applied here.
+    if (currentPageText) {
+      textPrompt += `\nCURRENT PAGE TEXT CONTENT (extracted). This is a DOM+Accessibility tree hybrid representation of the page:
+-------
+${currentPageText}
+-------\n`;
+    }
+
+    // Add loop warning if needed
+    if (this._hasPossibleLoop(previousSteps)) {
+      textPrompt += `\nWARNING: You appear to be repeating similar actions without making progress. Try a completely different approach to achieve your goal. Consider:
+1. Using a different tool (e.g., ACT instead of OBSERVE)
+2. Looking at different parts of the page in the screenshot
+3. Trying a different interaction method (e.g., different selector or action)
+4. Navigating to a different page if options are exhausted here\n`;
+    }
+
+    textPrompt += `
+Determine the next single step to achieve the subtask goal. Carefully analyze the provided screenshot AND the textual page content.
+Respond ONLY with the JSON object matching the required schema.`;
+
+    try {
+      const messages: CoreMessage[] = [
+        { role: "system", content: WORKER_PROMPT },
+        {
+          role: "user",
+          content: [
+            { type: "text", text: textPrompt },
+            // Add screenshot if available
+            ...(screenshot
+              ? [{ type: "image" as const, image: screenshot }]
+              : []),
+          ],
+        },
+      ];
+
+      const result = await this.llmClient.generateObject({
+        model: WorkerLLM, // Use the defined WorkerLLM
+        messages,
+        schema: browserStepSchema,
+      });
+
+      let nextStep: BrowserStep = result.object as BrowserStep;
+
+      // If the LLM used the deprecated CLOSE tool, convert it to DONE
+      if (nextStep.tool === "CLOSE") {
+        this.logger({
+          category: "operator",
+          message: `[Subtask ${subtaskId}] LLM used deprecated CLOSE tool, converting to DONE.`,
+          level: 1,
+        });
+        nextStep = {
+          ...nextStep,
+          tool: "DONE",
+          text: nextStep.text.replace("Closing", "Completing"),
+          instruction: `Subtask assumed complete based on CLOSE attempt: ${nextStep.instruction || nextStep.text}`,
+        };
+      }
+
+      // Check for implicit completion signals even if DONE tool wasn't used.
+      // An explicit FAIL is never overridden.
+      if (nextStep.tool !== "DONE" && nextStep.tool !== "FAIL") {
+        const completionPhrases = [
+          "task complete",
+          "goal achieved",
+          "subtask complete",
+        ];
+        // Text is inspected before reasoning; each field is lower-cased once.
+        const soundsComplete = [nextStep.text, nextStep.reasoning].some(
+          (field) => {
+            const lowered = field.toLowerCase();
+            return completionPhrases.some((phrase) => lowered.includes(phrase));
+          },
+        );
+        if (soundsComplete) {
+          this.logger({
+            category: "operator",
+            message: `[Subtask ${subtaskId}] Detected completion language but no DONE tool, converting step to DONE.`,
+            level: 1,
+          });
+          return {
+            ...nextStep, // Keep original reasoning/text for context
+            tool: "DONE" as const,
+            instruction: `Subtask implicitly completed: ${nextStep.instruction || nextStep.text}`,
+          };
+        }
+      }
+
+      return nextStep;
+    } catch (error) {
+      this.logger({
+        category: "operator",
+        message: `[Subtask ${subtaskId}] Error generating next step instruction: ${error}`,
+        level: 0,
+      });
+      // Fallback to a safe action if generation fails
+      return {
+        text: "Failed to determine next step, taking a screenshot to reassess",
+        reasoning: `Error occurred in step generation: ${error instanceof Error ? error.message : String(error)}. Capturing current state to recover.`,
+        tool: "SCREENSHOT",
+        instruction: "",
+      };
+    }
+  }
 } diff --git a/lib/index.ts b/lib/index.ts index 2976674ce..48596f141 100644 --- a/lib/index.ts +++ b/lib/index.ts @@ -827,6 +827,8 @@ export class Stagehand { this.stagehandPage, this.logger, this.llmClient, + this.llmProvider, + this.modelName, ).execute(instructionOrOptions); }, }; diff --git a/lib/llm/LLMProvider.ts b/lib/llm/LLMProvider.ts index fc11c5753..a5a01bd9d 100644 --- a/lib/llm/LLMProvider.ts +++ b/lib/llm/LLMProvider.ts @@ -93,7 +93,7 @@ const modelToProviderMap: { [key in AvailableModel]: ModelProvider } = { "gemini-2.5-pro-preview-03-25": "google", }; -function getAISDKLanguageModel( +export function getAISDKLanguageModel( subProvider: string, subModelName: string, apiKey?: string, diff --git a/lib/package.json b/lib/package.json index cfd9b4f38..284726301 100644 --- a/lib/package.json +++ b/lib/package.json @@ -3,9 +3,9 @@ "version": "2.2.1", "private": true, "description": "Core Stagehand library sources", - "main": "../dist/index.js", - "module": "../dist/index.js", - "types": "../dist/index.d.ts", + "main": "./index.ts", + "module": "./dist/index.ts", + "types": "./dist/index.d.ts", "scripts": { "build-dom-scripts": "tsx dom/genDomScripts.ts", "build-js": "tsup index.ts --dts", diff --git a/lib/prompt.ts b/lib/prompt.ts index 2183ef967..037fbfcb5 100644 --- 
a/lib/prompt.ts +++ b/lib/prompt.ts @@ -1,5 +1,4 @@ import { ChatMessage } from "./llm/LLMClient"; - export function buildUserInstructionsString( userProvidedInstructions?: string, ): string { @@ -170,23 +169,92 @@ export function buildActObservePrompt( return instruction; } -export function buildOperatorSystemPrompt(goal: string): ChatMessage { - return { - role: "system", - content: `You are a general-purpose agent whose job is to accomplish the user's goal across multiple model calls by running actions on the page. +export const PLANNER_PROMPT = ` +You are a Task Planning Agent responsible for breaking down user goals into clear, executable subtasks for web automation workers. Your job is to create a detailed plan with specific subtasks that web automation workers can execute. -You will be given a goal and a list of steps that have been taken so far. Your job is to determine if either the user's goal has been completed or if there are still steps that need to be taken. +Each worker will: +1. Have a single subtask goal to accomplish +2. Use a "best next step" approach to complete their subtask +3. Be limited to using these tools: ACT, EXTRACT, OBSERVE, SCREENSHOT, WAIT, or NAVBACK +4. Retry up to 3 times before reporting failure +5. Report either DONE or FAIL status upon completion -# Your current goal -${goal} +When creating a plan: +1. Break the goal into logical, sequential subtasks +2. Ensure each subtask is focused and achievable +3. Specify a clear goal for each subtask +4. Consider dependencies between subtasks +5. Provide enough context for each worker to understand their role -# Important guidelines -1. Break down complex actions into individual atomic steps -2. For \`act\` commands, use only one action at a time, such as: - - Single click on a specific element - - Type into a single input field - - Select a single option -3. Avoid combining multiple actions in one instruction -4. 
If multiple actions are needed, they should be separate steps`, - }; -} +For example, for a task like "Check the price of NVIDIA stock": +- Subtask 1: Navigate to a financial website (Goal: Find and open a reliable financial information source) +- Subtask 2: Search for NVIDIA stock (Goal: Locate the NVIDIA stock page) +- Subtask 3: Extract the current stock price (Goal: Find and extract the current price of NVIDIA stock) +- Subtask 4: Extract any additional relevant information (Goal: Find important metrics like daily change, market cap, etc.) + +DO NOT include specific website instructions or action sequences. Focus on WHAT to accomplish, not HOW. +`; +
+/**
+ * System prompt for the per-subtask worker model.
+ *
+ * Lists the tools the worker may choose from and the rules for picking a
+ * single next step. GOTO is deliberately not offered: the prompt must never
+ * suggest it, because it closes by forbidding that tool outright.
+ */
+export const WORKER_PROMPT = `
+You are a Web Automation Worker responsible for completing a specific subtask that contributes to a larger goal. Your job is to determine the immediate next best action to take at each step to accomplish your specific subtask goal.
+
+Remember that your subtask is part of a broader plan. Even with vague instructions, you should:
+- Consider how your work contributes to the overall goal
+- Adapt your approach based on what you observe
+- Make intelligent decisions if the original plan needs adjustment
+
+You will use a "best next step" approach:
+1. CAREFULLY ANALYZE the current state of the webpage through the screenshot provided
+2. REFLECT on how your subtask contributes to the overall goal
+3. Decide the single most appropriate next action
+4. Execute that action using one of these tools:
+   - ACT: Perform an action like clicking, typing, etc.
+   - SCREENSHOT: Take a screenshot of the current page
+   - WAIT: Wait for a specific condition or time
+   - NAVBACK: Navigate back to the immediately previous page (repeat it to go back further; the GOTO tool is not available)
+   - GET_URL: Get the current page URL (simpler than EXTRACT)
+   - EXTRACT: Extract data from the page using JavaScript
+   - DONE: Mark the subtask as successfully completed
+   - FAIL: Mark the subtask as failed due to unresolvable issues
+
+Tool Guidelines:
+- ACT: Use for clicking elements, typing text, selecting options, etc. Be specific about the target element. If clicking failed, try selecting, or the other way around. For the most part you should be clicking and filling.
+- SCREENSHOT: Use when you need a fresh view of the page or after a significant change.
+- WAIT: Use when you need to wait for an element to appear or for a page to load.
+- NAVBACK: Use when you need to go back to a previous page.
+- GET_URL: A simple way to get the current URL without using JavaScript. Use this instead of EXTRACT when you just need the URL.
+- EXTRACT: Use for extracting data using JavaScript when GET_URL is not sufficient.
+- DONE: Use ONLY when the subtask is 100% complete. Provide a clear message explaining what was accomplished.
+- FAIL: Use when you've encountered an error that cannot be resolved after multiple attempts. Provide details about the failure.
+
+IMPORTANT VISUAL AWARENESS:
+- ALWAYS carefully study the screenshot before deciding your next action
+- The screenshot is your primary source of information about the page
+- Look at the entire page to identify elements, buttons, forms, and text
+- Pay special attention to error messages, popup notifications, or loading indicators
+- If you see a CAPTCHA or security challenge, report it immediately
+- Don't repeat the same action if it's not working - try a different approach
+
+Guidelines for Self-Healing:
+1. Break down complex actions into single atomic steps (one click, one text input)
+2. Focus on completing your subtask while understanding its role in the overall task
+3. Take actions that directly contribute to your goal
+4. If you encounter errors or obstacles:
+   - Try alternative approaches that might achieve the same outcome
+   - Consider if a different path would better serve the overall goal
+   - If the exact subtask can't be completed, achieve as much as possible
+5. After 3 failed attempts, use the FAIL tool with a detailed explanation
+6. When the subtask is completed, use the DONE tool with a clear success message
+7. DO NOT get stuck in loops - if you find yourself repeating the same action, try something completely different
+
+You will be provided with:
+- A screenshot of the current webpage (updated after every action)
+- The overall goal of the task
+- Your specific subtask and its goal
+- Context about how your subtask fits into the larger plan
+- Any previous steps you've taken
+- Results of any previous extractions
+
+Remember: Visual confirmation through the screenshot is your most reliable guide for making decisions!
+NEVER CALL THE GOTO TOOL. EVER.
+`;
diff --git a/types/operator.ts b/types/operator.ts index 08ee870f3..8f6d77732 100644 --- a/types/operator.ts +++ b/types/operator.ts @@ -1,3 +1,4 @@ +// TODO: Remove entirely, change to agent type. import { z } from "zod"; export const operatorResponseSchema = z.object({ @@ -6,34 +7,22 @@ export const operatorResponseSchema = z.object({ .describe( "The reasoning for the step taken. If this step's method is `close`, the goal was to extract data, and the task was successful, state the data that was extracted.", ), - method: z.enum([ - "act", - "extract", - "goto", - "close", - "wait", - "navback", - "refresh", - ]) + method: z.enum(["act", "extract", "goto", "wait", "navback", "refresh"]) .describe(`The action to perform on the page based off of the goal and the current state of the page. 
goto: Navigate to a specific URL. act: Perform an action on the page. extract: Extract data from the page. - close: The task is complete, close the browser. wait: Wait for a period of time. navback: Navigate back to the previous page. Do not navigate back if you are already on the first page. refresh: Refresh the page.`), - parameters: z - .string() - .describe( - `The parameter for the action. Only pass in a parameter for the following methods: - - act: The action to perform. e.g. "click on the submit button" or "type [email] into the email input field and press enter" + parameters: z.string().describe( + `The parameter for the action. Only pass in a parameter for the following methods: + - act: The action to perform. e.g. "click on the submit button" or "type [email] into the email input field and press enter". NEVER CALL ACT WITHOUT AN INSTRUCTION. - extract: The data to extract. e.g. "the title of the article". If you want to extract all of the text on the page, leave this undefined. - wait: The amount of time to wait in milliseconds. - goto: The URL to navigate to. e.g. "https://www.google.com" - The other methods do not require a parameter.`, - ) - .optional(), + The other methods do not require a parameter. Pass in an empty string for the parameter.`, + ), taskComplete: z .boolean() .describe(