diff --git a/.gitignore b/.gitignore index 9e939e1..ca55236 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,6 @@ yarn-error.log* # typescript *.tsbuildinfo next-env.d.ts + +# logs +logs/ \ No newline at end of file diff --git a/README.md b/README.md index 3415885..a5278c4 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -# Testing Agent Demo +# Testing Agent Demo - Azure OpenAI Edition [![MIT License](https://img.shields.io/badge/License-MIT-green.svg)](frontend/LICENSE) -This monorepo demonstrates how you can use OpenAI's CUA model and [computer use tool](https://platform.openai.com/docs/guides/tools-computer-use) to automate frontend testing. It uses [Playwright](https://playwright.dev) to spin up a browser instance and navigate to the web app to be tested. The CUA model then follows the provided test case and executes actions on the interface until the test case is done. +This monorepo demonstrates how you can use Azure OpenAI's computer-use-preview model and [computer use tool](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/computer-use) to automate frontend testing. It uses [Playwright](https://playwright.dev) to spin up a browser instance and navigate to the web app to be tested. The Azure OpenAI computer-use-preview model then follows the provided test case and executes actions on the interface until the test case is done. The repo contains three applications that work together: - **frontend** – Next.js web interface used to configure tests and watch them run. -- **cua-server** – Node service that communicates with the OpenAI CUA model and drives Playwright to interact in a browser with the sample app. +- **cua-server** – Node service that communicates with the Azure OpenAI computer-use-preview model and drives Playwright to interact in a browser with the sample app. - **sample-test-app** – Example e‑commerce site used as an example app to test by the agent. 
![screenshot](./screenshot.jpg) @@ -24,9 +24,16 @@ The repo contains three applications that work together: cd openai-testing-agent-demo ``` -2. **Prepare environment files** +2. **Set up Azure OpenAI** - If you haven't set your `OPENAI_API_KEY` environment variable on your terminal or globally on your machine (set up instructions [here](https://platform.openai.com/docs/libraries#create-and-export-an-api-key)), edit each `.env.development` file and set `OPENAI_API_KEY`. + - Create an Azure OpenAI resource in the Azure portal + - Deploy the `computer-use-preview` model for computer use capabilities + - Deploy a chat model like `gpt-4o` for test case generation + - Get your API key and endpoint from the Azure portal + +3. **Prepare environment files** + + Copy the example environment files and configure with your Azure OpenAI settings: ```bash cp frontend/.env.example frontend/.env.development @@ -43,14 +50,14 @@ The repo contains three applications that work together: Make sure you add a `sample-test-app/.env.development` file with the example credentials to run the demo. -3. **Install dependencies** +4. **Install dependencies** ```bash npm install npx playwright install ``` -4. **Run all apps** +5. **Run all apps** ```bash npm run dev @@ -84,6 +91,7 @@ You are welcome to open issues or submit PRs to improve this app, however, pleas - This project is meant to be used on test environments only. - Do not use real user data in production. +- Ensure your Azure OpenAI API keys are kept secure and not committed to version control. 
## License diff --git a/cua-server/.env.example b/cua-server/.env.example index f2b9c19..7d8874c 100644 --- a/cua-server/.env.example +++ b/cua-server/.env.example @@ -1 +1,32 @@ -OPENAI_API_KEY=your-openai-key +# PROVIDER CONFIGURATION +USE_OPENAI=false + +# OpenAI Configuration +OPENAI_API_KEY=your_openai_api_key_here + +# OpenAI Model Configuration +OPENAI_COMPUTER_USE_MODEL=computer-use-preview +OPENAI_TEST_CASE_AGENT=o3-mini +OPENAI_TEST_SCRIPT_REVIEW_AGENT=gpt-4o + +# Azure-OpenAI Configuration +AZURE_API_KEY=your_azure_openai_api_key_here +AZURE_ENDPOINT=your_azure_openai_api_endpoint +AZURE_API_VERSION=2025-03-01-preview + +# Azure-OpenAI Model Deployment Names +AZURE_COMPUTER_USE_MODEL_DEPLOYMENT_NAME=computer-use-preview +AZURE_TEST_CASE_AGENT_DEPLOYMENT_NAME=o3-mini +AZURE_TEST_SCRIPT_REVIEW_AGENT_DEPLOYMENT_NAME=gpt-4o-2 + +# Display Configuration +DISPLAY_WIDTH=1024 +DISPLAY_HEIGHT=768 + +# Server Configuration +SOCKET_PORT=8000 +CORS_ORIGIN=* + +# Logging +LOG_LEVEL=debug +NODE_ENV=development \ No newline at end of file diff --git a/cua-server/README.md b/cua-server/README.md index c7bbe02..0f9747e 100644 --- a/cua-server/README.md +++ b/cua-server/README.md @@ -1,26 +1,141 @@ -# CUA Server +# CUA Server - Azure OpenAI Edition -![OpenAI API](https://img.shields.io/badge/Powered_by-OpenAI_API-orange) +This is the core testing agent server that communicates with Azure OpenAI's computer-use-preview model to drive Playwright automation. -A Node.js service that interfaces with the OpenAI CUA model and exposes a Socket.IO WebSocket API used by the frontend. +## Prerequisites -## Setup +- Azure OpenAI resource with access to the computer-use-preview model +- Node.js and npm +- Valid Azure OpenAI API key and endpoint -1. Copy the example environment file and add your OpenAI key: +## Azure OpenAI Setup + +1. **Create Azure OpenAI Resource**: Set up an Azure OpenAI resource in your Azure portal +2. 
**Deploy Models**: Deploy the following models in your Azure OpenAI resource: + - `computer-use-preview` (for computer use capabilities) + - `gpt-4o` (for test case generation and review) +3. **Get API Key and Endpoint**: Obtain your API key and endpoint from the Azure portal + +## Configuration + +1. **Copy environment files**: ```bash cp .env.example .env.development # edit .env.development ``` -2. Install dependencies and launch the server: + +2. **Configure Azure OpenAI settings** in `.env.development`: + ```bash + # Your Azure OpenAI API key + AZURE_API_KEY=your_azure_openai_api_key_here + + # Your Azure OpenAI endpoint + AZURE_ENDPOINT=https://your-resource-name.openai.azure.com + + # API version (use the latest preview version) + AZURE_API_VERSION=2025-04-01-preview + + # Deployment names from your Azure OpenAI resource + AZURE_DEPLOYMENT_NAME=computer-use-preview + AZURE_DEPLOYMENT_NAME_CHAT=gpt-4o + ``` + +3. **Optional configurations**: + ```bash + # Display dimensions for the browser automation + DISPLAY_WIDTH=1024 + DISPLAY_HEIGHT=768 + + # Server configuration + SOCKET_PORT=8000 + CORS_ORIGIN=* + + # Logging level + LOG_LEVEL=info + + # Environment-specific instructions (e.g., for macOS) + ENV_SPECIFIC_INSTRUCTIONS=Use CMD key instead of CTRL key for macOS + ``` + +## Usage + +1. **Install dependencies**: ```bash npm install - npx playwright install - npm run dev # or npm start ``` - The server listens on port `8000` by default. Set `SOCKET_PORT` to change it. -### Environment Variables +2. **Run in development mode**: + ```bash + npm run dev + ``` + +3. **Build and run**: + ```bash + npm run build + npm start + ``` + +## Azure OpenAI API Compatibility + +This server now uses Azure OpenAI API instead of the standard OpenAI API. 
The key differences: + +- Uses `AzureOpenAI` client instead of `OpenAI` +- Requires endpoint and API version configuration +- Uses deployment names instead of model names +- Supports the same computer-use-preview functionality via Azure + +## API Example + +The server makes requests similar to this curl example: +```bash +curl -X POST "https://your-resource-name.openai.azure.com/openai/responses?api-version=2025-04-01-preview" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $AZURE_API_KEY" \ + -d '{ + "model": "computer-use-preview", + "input": [...] + }' +``` + +## Environment Variables Reference + +| Variable | Description | Required | Default | +|----------|-------------|----------|---------| +| `AZURE_API_KEY` | Your Azure OpenAI API key | Yes | - | +| `AZURE_ENDPOINT` | Your Azure OpenAI endpoint URL | Yes | - | +| `AZURE_API_VERSION` | Azure OpenAI API version | No | `2025-04-01-preview` | +| `AZURE_DEPLOYMENT_NAME` | Computer-use model deployment name | No | `computer-use-preview` | +| `AZURE_DEPLOYMENT_NAME_CHAT` | Chat model deployment name | No | `gpt-4o` | +| `DISPLAY_WIDTH` | Browser viewport width | No | `1024` | +| `DISPLAY_HEIGHT` | Browser viewport height | No | `768` | +| `SOCKET_PORT` | WebSocket server port | No | `8000` | +| `CORS_ORIGIN` | CORS origin setting | No | `*` | +| `LOG_LEVEL` | Logging level | No | `info` | +| `ENV_SPECIFIC_INSTRUCTIONS` | OS-specific instructions | No | - | + +## Architecture + +The server includes several key components: + +- **OpenAI CUA Client** (`services/openai-cua-client.ts`): Handles Azure OpenAI API communication +- **Test Case Agent** (`agents/test-case-agent.ts`): Generates test case steps +- **Test Script Review Agent** (`agents/test-script-review-agent.ts`): Reviews and updates test progress +- **Computer Use Loop** (`lib/computer-use-loop.ts`): Main automation loop +- **Handlers**: WebSocket and browser automation handlers + +## Troubleshooting + +1. 
**Authentication Error**: Verify your `AZURE_API_KEY` and `AZURE_ENDPOINT` are correct +2. **Model Not Found**: Ensure your deployment names match what's configured in Azure +3. **API Version Error**: Try using a different `AZURE_API_VERSION` if the preview version is not available +4. **Rate Limiting**: Azure OpenAI has rate limits that may be different from standard OpenAI + +## Migration from OpenAI + +This project has been updated from standard OpenAI to Azure OpenAI. Key changes: -- `OPENAI_API_KEY` – required for calls to the CUA model. -- `SOCKET_PORT` (optional) – WebSocket port (default `8000`). -- `CORS_ORIGIN` (optional) – allowed CORS origin for incoming connections. +1. `OpenAI` client replaced with `AzureOpenAI` +2. `OPENAI_API_KEY` replaced with `AZURE_API_KEY` +3. Added `AZURE_ENDPOINT` and `AZURE_API_VERSION` requirements +4. Model names replaced with deployment names +5. All API calls now go through Azure OpenAI endpoints diff --git a/cua-server/src/agents/test-case-agent.ts b/cua-server/src/agents/test-case-agent.ts index 3d7eced..5dc6f5f 100644 --- a/cua-server/src/agents/test-case-agent.ts +++ b/cua-server/src/agents/test-case-agent.ts @@ -1,53 +1,51 @@ import { PROMPT_WITHOUT_LOGIN, PROMPT_WITH_LOGIN } from "../lib/constants"; import logger from "../utils/logger"; -import OpenAI from "openai"; -import { z } from "zod"; -import { zodTextFormat } from "openai/helpers/zod"; - -export const TestCaseStepSchema = z.object({ - step_number: z.number(), - step_instructions: z.string(), - status: z.string().nullable(), -}); - -export const TestCaseSchema = z.object({ - steps: z.array(TestCaseStepSchema), -}); - -export type TestCase = z.infer; - -const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); +import { openai_service } from "../services/openai-service"; +import { TestCase, TEST_CASE_JSON_SCHEMA } from "../utils/test-case-utils"; class TestCaseAgent { - private readonly model = "o3-mini"; - private readonly developer_prompt: string; + 
private readonly model: string; + private readonly system_prompt: string; private readonly login_required: boolean; constructor(login_required = false) { this.login_required = login_required; - this.developer_prompt = login_required - ? PROMPT_WITH_LOGIN - : PROMPT_WITHOUT_LOGIN; - logger.trace(`Developer prompt: ${this.developer_prompt}`); + this.system_prompt = login_required ? PROMPT_WITH_LOGIN : PROMPT_WITHOUT_LOGIN; + + // Use different model names based on provider + if (process.env.USE_OPENAI === 'true') { + this.model = process.env.OPENAI_TEST_CASE_AGENT || "o3-mini"; + } else { + this.model = process.env.AZURE_TEST_CASE_AGENT_DEPLOYMENT_NAME || "o3-mini"; + } } /** - * Generate structured test steps via the Responses API. + * Generate test steps via the unified Response API. */ - async invokeResponseAPI(userInstruction: string): Promise { - logger.debug("Invoking Response API", { userInstruction }); - const response = await openai.responses.parse({ + async generateTestCases(userInstruction: string): Promise { + logger.info("Generating Test Cases"); + + const response = await openai_service.responseAPI({ + systemPrompt: this.system_prompt, + userMessage: userInstruction, model: this.model, - input: [ - { role: "system", content: this.developer_prompt }, - { role: "user", content: userInstruction }, - ], - text: { - format: zodTextFormat(TestCaseSchema, "test_case"), - }, + schema: TEST_CASE_JSON_SCHEMA, + schemaName: "test_case" }); - logger.debug("Response API output", { output: response.output_parsed }); - return response.output_parsed!; + + if (!response.output_text) { + throw new Error("No output text received from OpenAI service"); + } + + const result: TestCase = JSON.parse(response.output_text); + + logger.info(`Test Cases Generated Successfully:\n${JSON.stringify({ + loginRequired: this.login_required, + steps: result.steps + }, null, 2)}`); + + return result; } } diff --git a/cua-server/src/agents/test-script-review-agent.ts 
b/cua-server/src/agents/test-script-review-agent.ts index dbdd33d..69eaa53 100644 --- a/cua-server/src/agents/test-script-review-agent.ts +++ b/cua-server/src/agents/test-script-review-agent.ts @@ -3,151 +3,84 @@ * Each call to checkTestScriptStatus enqueues a new screenshot processing job. */ import logger from "../utils/logger"; -import OpenAI from "openai"; import fs from "fs"; import path from "path"; import { v4 as uuidv4 } from "uuid"; import { TEST_SCRIPT_REVIEW_PROMPT } from "../lib/constants"; +import { openai_service } from "../services/openai-service"; +import { ScreenshotUtils } from "../utils/screenshot-utils"; +import { TestScriptState, getStepsWithStatusChange, updateStepImagePaths, TEST_SCRIPT_AGENT_JSON_SCHEMA } from "../utils/test-script-utils"; -const openai = new OpenAI(); - -interface TestScriptState { - steps: Array<{ - step_number: number; - status: string; - step_reasoning: string; - image_path?: string; - }>; -} - -interface Task { +interface ReviewTask { base64Image: string; userInstruction?: string; - resolve: (value: any) => void; - reject: (error: any) => void; + resolve: (value: string) => void; + reject: (error: Error) => void; } class TestScriptReviewAgent { - model: string; - previous_response_id: string | null; - test_script_state: TestScriptState | null; - runFolder: string | null; + private readonly model: string; + private previous_response_id: string | null = null; + private test_script_state: TestScriptState | null = null; + private screenshotUtils: ScreenshotUtils | null = null; // Flag whether to include the previous screenshot response in the input to the LLM - true works best - includePreviousResponse: boolean = true; + private readonly includePreviousResponse: boolean = true; // Task queue related properties - private taskQueue: Task[] = []; - private processingQueue: boolean = false; + private readonly taskQueue: ReviewTask[] = []; + private isProcessingQueue: boolean = false; constructor() { - // Set the default model 
to "gpt-4o" - this.model = "gpt-4o"; - - // Maintain the previous response id. - this.previous_response_id = null; - - // Save the current state of the test script. Initially null. - this.test_script_state = null; - - // Initialize runFolder as null; will be set on each new run - this.runFolder = null; + // Use different model names based on provider + if (process.env.USE_OPENAI === 'true') { + this.model = process.env.OPENAI_TEST_SCRIPT_REVIEW_AGENT || "gpt-4o"; + } else { + this.model = process.env.AZURE_TEST_SCRIPT_REVIEW_AGENT_DEPLOYMENT_NAME || 'gpt-4o'; + } + logger.debug(`TestScriptReviewAgent Initialized with model: ${this.model}`); } /** * Creates the initial test script state from the user instructions. */ - async instantiateAgent(userInstruction: string): Promise { - logger.debug( - `Invoking Chat API (instantiateAgent) with instruction: ${userInstruction}` - ); - logger.debug( - `Instantiation agent - This should only be called once per test script run.` - ); - - const response = await openai.responses.create({ - model: this.model, - input: [ - { role: "system", content: TEST_SCRIPT_REVIEW_PROMPT }, - { - role: "user", - content: [ - { type: "input_text", text: "Instructions: " + userInstruction }, - ], - }, - ], - text: { - format: { - type: "json_schema", - name: "test_script_output", - schema: { - type: "object", - properties: { - steps: { - type: "array", - items: { - type: "object", - properties: { - step_number: { type: "number" }, - status: { - type: "string", - enum: ["pending", "Pass", "Fail"], - }, - step_reasoning: { type: "string" }, - }, - required: ["step_number", "status", "step_reasoning"], - additionalProperties: false, - }, - }, - }, - required: ["steps"], - additionalProperties: false, - }, - }, - }, - }); - - logger.debug( - `Response from instantiateAgent: ${JSON.stringify( - response.output_text, - null, - 2 - )}` - ); - - this.previous_response_id = response.id; + async instantiateAgent(userInstruction: string): Promise { + 
logger.trace(`Instantiation agent with instruction (This should only be called once per test script run):\n${userInstruction}`); + + try { + const response = await openai_service.responseAPI({ + systemPrompt: TEST_SCRIPT_REVIEW_PROMPT, + userMessage: "Instructions: " + userInstruction, + model: this.model, + schema: TEST_SCRIPT_AGENT_JSON_SCHEMA, + schemaName: "test_script_output" + }); - // Parse the returned JSON once, store it as an object - const parsedState: TestScriptState = JSON.parse(response.output_text); - this.test_script_state = parsedState; + logger.info(`Agent Instantiation Successful: ${response.id}`); - // Create a unique folder for this run and store its name in runFolder - this.runFolder = uuidv4(); - const runFolderPath = path.join( - process.cwd(), - "..", - "frontend", - "public", - "test_results", - this.runFolder - ); - if (!fs.existsSync(runFolderPath)) { - fs.mkdirSync(runFolderPath, { recursive: true }); - logger.debug(`Run folder created: ${runFolderPath}`); + this.previous_response_id = response.id; + this.test_script_state = JSON.parse(response.output_text!) as TestScriptState; + + // Create screenshot utils instance for this session + this.screenshotUtils = new ScreenshotUtils(); + await this.screenshotUtils.ensureRunFolder(); + + return response.output_text!; + } catch (error) { + logger.error(`Failed to Instantiate Agent:\n${JSON.stringify({ + error: error instanceof Error ? error.message : error + }, null, 2)}`); + throw new Error(`Failed to instantiate agent: ${error instanceof Error ? error.message : 'Unknown error'}`); } - - return response.output_text; // Return the raw JSON string for now } /** * Enqueues a new test script review task. 
*/ - async checkTestScriptStatus( - base64Image: string, - userInstruction?: string - ): Promise { + async checkTestScriptStatus(base64Image: string, userInstruction?: string): Promise { + logger.debug("Enqueuing test script review task"); + return new Promise((resolve, reject) => { - // Enqueue the new task. this.taskQueue.push({ base64Image, userInstruction, resolve, reject }); this.processQueue(); }); @@ -156,198 +89,100 @@ class TestScriptReviewAgent { /** * Processes the task queue sequentially. */ - private async processQueue() { - if (this.processingQueue) return; - this.processingQueue = true; - - while (this.taskQueue.length > 0) { - const { base64Image, userInstruction, resolve, reject } = - this.taskQueue.shift()!; - try { - const result = await this.processTestScriptStatus( - base64Image, - userInstruction - ); - resolve(result); - } catch (error) { - reject(error); + private async processQueue(): Promise { + if (this.isProcessingQueue) { + logger.debug("Queue processing already in progress"); + return; + } + + this.isProcessingQueue = true; + logger.debug("Starting Queue Processing"); + + try { + while (this.taskQueue.length > 0) { + const task = this.taskQueue.shift()!; + try { + const result = await this.processTestScriptReview(task.base64Image, task.userInstruction); + task.resolve(result); + } catch (error) { + logger.error(`Task processing failed:\n${JSON.stringify({ error: error instanceof Error ? error.message : error }, null, 2)}`); + task.reject(error instanceof Error ? error : new Error(String(error))); + } } + } finally { + this.isProcessingQueue = false; + logger.debug("Queue processing completed"); } - this.processingQueue = false; } /** * Processes the test script status by sending the screenshot (and optional instruction) to the LLM, * then updating the test script state with any changes. 
*/ - private async processTestScriptStatus( - base64Image: string, - userInstruction?: string - ): Promise { - logger.debug( - `Invoking checkTestScriptStatus. Previous response id: ${this.previous_response_id}; Image length: ${base64Image.length}` - ); - + private async processTestScriptReview(base64Image: string, userInstruction?: string): Promise { + logger.debug(`Processing TestScriptReviewAgent with Previous Response ID: ${this.previous_response_id}`); + // If we don't already have a test_script_state, just parse blank structure if (!this.test_script_state) { this.test_script_state = { steps: [] }; - logger.warn("No previous test_script_state found, creating empty state."); + logger.warn("No previous test script state found, creating empty state"); } - // Build the input messages starting with the system prompt. - const inputMessages: Array = [ - { role: "system", content: TEST_SCRIPT_REVIEW_PROMPT }, - ]; - - // Construct the user message content. - const userContent: Array = []; - if (userInstruction) { - userContent.push({ - type: "input_text", - text: "Context: " + userInstruction, - }); + if (!this.screenshotUtils) { + throw new Error("Screenshot utils not initialized. Call instantiateAgent first."); } - userContent.push({ - type: "input_image", - image_url: `data:image/png;base64,${base64Image}`, - detail: "high", - }); - inputMessages.push({ - role: "user", - content: userContent, - }); + try { + // Call the unified AI service - just pass the schema directly! + const response = await openai_service.responseAPI({ + systemPrompt: TEST_SCRIPT_REVIEW_PROMPT, + userMessage: userInstruction ? "Context: " + userInstruction : undefined, + base64Image, + previousResponseId: this.previous_response_id || undefined, + model: this.model, + schema: TEST_SCRIPT_AGENT_JSON_SCHEMA, + schemaName: "test_script_output" + }); - // Call the OpenAI API with the new payload. 
- const response = await openai.responses.create({ - model: this.model, - input: inputMessages, - previous_response_id: this.previous_response_id || undefined, - text: { - format: { - type: "json_schema", - name: "test_script_output", - schema: { - type: "object", - properties: { - steps: { - type: "array", - items: { - type: "object", - properties: { - step_number: { type: "number" }, - status: { - type: "string", - enum: ["pending", "Pass", "Fail"], - }, - step_reasoning: { type: "string" }, - }, - required: ["step_number", "status", "step_reasoning"], - additionalProperties: false, - }, - }, - }, - required: ["steps"], - additionalProperties: false, - }, - }, - }, - }); + logger.debug(`TestScriptReviewAgent Response received with Response ID: ${response.id}`); - logger.debug(`Response output text: ${response.output_text}`); + // Update previous response id if configured to do so + if (this.includePreviousResponse) { + this.previous_response_id = response.id; + } - // Conditionally update the previous response id based on the config setting. 
- if (this.includePreviousResponse) { - this.previous_response_id = response.id; - } + // Parse the new state + const newState: TestScriptState = JSON.parse(response.output_text!); + const oldSteps = this.test_script_state.steps; + + // Determine if any steps changed status + const changedSteps = getStepsWithStatusChange(oldSteps, newState.steps); + + // Save screenshot if there were status changes + let screenshotPath: string | undefined; + if (changedSteps.size > 0) { + screenshotPath = await this.screenshotUtils.saveScreenshot(base64Image); + } - // Parse the new steps from the model - const newState: TestScriptState = JSON.parse(response.output_text); + // Update image paths for all steps + updateStepImagePaths(oldSteps, newState.steps, changedSteps, screenshotPath); - // Ensure the run folder exists (it should be set during instantiateAgent) - if (!this.runFolder) { - this.runFolder = uuidv4(); - const runFolderPath = path.join( - process.cwd(), - "..", - "frontend", - "public", - "test_results", - this.runFolder - ); - fs.mkdirSync(runFolderPath, { recursive: true }); - logger.debug(`Run folder created: ${runFolderPath}`); - } + // Update internal state + this.test_script_state = newState; - // Compare old vs. new test script states to determine if any step transitioned from "pending" -> "Pass"/"Fail". - const oldSteps = this.test_script_state ? 
this.test_script_state.steps : []; - const shouldSaveScreenshot = oldSteps.some((oldStep) => { - const newStep = newState.steps.find( - (s) => s.step_number === oldStep.step_number - ); - return ( - newStep && - oldStep.status === "pending" && - (newStep.status === "Pass" || newStep.status === "Fail") - ); - }); + const updatedJson = JSON.stringify(this.test_script_state); + logger.debug(`Test Script State Updated:\n${JSON.stringify({ + stepsCount: this.test_script_state.steps.length, + changedStepsCount: changedSteps.size + }, null, 2)}`); + return updatedJson; - if (shouldSaveScreenshot) { - // Save the screenshot under the run folder within /public/test_results - const screenshotFilename = uuidv4() + ".png"; - const screenshotPathLocal = path.join( - process.cwd(), - "..", - "frontend", - "public", - "test_results", - this.runFolder, - screenshotFilename - ); - try { - const bufferData = Buffer.from(base64Image, "base64"); - fs.writeFileSync(screenshotPathLocal, new Uint8Array(bufferData)); - logger.debug(`Screenshot saved to: ${screenshotPathLocal}`); - } catch (err) { - logger.error("Error saving screenshot", err); - } - - // Iterate through steps and attach the screenshot path only for those with a status change. - for (const newStep of newState.steps) { - const oldStep = oldSteps.find( - (s) => s.step_number === newStep.step_number - ); - if (oldStep) { - if ( - oldStep.status === "pending" && - (newStep.status === "Pass" || newStep.status === "Fail") - ) { - newStep.image_path = - "/test_results/" + this.runFolder + "/" + screenshotFilename; - } else if (oldStep.image_path) { - newStep.image_path = oldStep.image_path; - } - } - } - } else { - // No status change detected; simply carry over any existing image paths. 
- for (const newStep of newState.steps) { - const oldStep = oldSteps.find( - (s) => s.step_number === newStep.step_number - ); - if (oldStep && oldStep.image_path) { - newStep.image_path = oldStep.image_path; - } - } + } catch (error) { + logger.error("Test script review processing failed", { + error: error instanceof Error ? error.message : error + }); + throw new Error(`Failed to process test script status: ${error instanceof Error ? error.message : 'Unknown error'}`); } - - // Update our internal test_script_state with the new state - this.test_script_state = newState; - - // Return the entire updated JSON as a string - const updatedJson = JSON.stringify(this.test_script_state); - logger.debug(`Updated test_script_state: ${updatedJson}`); - return updatedJson; } } diff --git a/cua-server/src/handlers/cua-loop-handler.ts b/cua-server/src/handlers/cua-loop-handler.ts index fa284e3..caa0b51 100644 --- a/cua-server/src/handlers/cua-loop-handler.ts +++ b/cua-server/src/handlers/cua-loop-handler.ts @@ -5,9 +5,8 @@ import logger from "../utils/logger"; import { computerUseLoop } from "../lib/computer-use-loop"; import { Socket } from "socket.io"; import TestScriptReviewAgent from "../agents/test-script-review-agent"; -import { setupCUAModel } from "../services/openai-cua-client"; +import { cua_service, CUAModelInput } from "../services/openai-cua-service"; import { LoginService } from "../services/login-service"; -import { ModelInput } from "../services/openai-cua-client"; // Read viewport dimensions from .env file with defaults if not set const displayWidth: number = parseInt(process.env.DISPLAY_WIDTH || "1024", 10); @@ -23,7 +22,11 @@ export async function cuaLoopHandler( loginRequired: boolean, userInfo?: string ) { - logger.info("Starting test script execution..."); + logger.info("Starting test script execution", { + url, + loginRequired, + socketId: socket.id + }); socket.emit("message", "Starting test script execution..."); try { @@ -33,8 +36,7 @@ export async 
function cuaLoopHandler( args: ["--disable-extensions", "--disable-file-system"], }); - logger.debug("Creating new browser instance..."); - + logger.debug("Browser launched successfully"); socket.emit("message", "Launching browser..."); const page = await browser.newPage(); @@ -44,17 +46,20 @@ export async function cuaLoopHandler( // Set viewport dimensions using env values await page.setViewportSize({ width: displayWidth, height: displayHeight }); + logger.debug(`Viewport set:\n${JSON.stringify({ width: displayWidth, height: displayHeight }, null, 2)}`); // Navigate to the provided URL from the form. await page.goto(url); + logger.debug(`Navigated to URL: ${url}`); // wait for 2 seconds await page.waitForTimeout(2000); // Capture an initial screenshot. const screenshot_before_login = await page.screenshot(); - const screenshot_before_login_base64 = - screenshot_before_login.toString("base64"); + const screenshot_before_login_base64 = screenshot_before_login.toString("base64"); + + logger.debug("Initial screenshot captured"); // Asynchronously check the status of the test script. const testScriptReviewResponsePromise = @@ -62,82 +67,69 @@ export async function cuaLoopHandler( // Asynchronously emit the test script review response to the socket. testScriptReviewResponsePromise.then((testScriptReviewResponse) => { - logger.debug( - "Sending screenshot before login to Test Script Review Agent" - ); + logger.debug("Sending initial screenshot to TestScriptReviewAgent"); socket.emit("testscriptupdate", testScriptReviewResponse); - logger.trace( - `Initial test script state emitted: ${JSON.stringify( - testScriptReviewResponse, - null, - 2 - )}` - ); + }).catch((error) => { + logger.error("Initial test script review failed", { + error: error instanceof Error ? error.message : error + }); }); // Await till network is idle. 
await page.waitForTimeout(2000); - let modelInput: ModelInput; + let cua_model_input: CUAModelInput; if (loginRequired) { // Note to the developer: Different applications will need their own login handlers. - logger.debug("Login required... proceeding with login."); + logger.info("Processing login requirement"); socket.emit("message", "Login required... proceeding with login."); const loginService = new LoginService(); await loginService.fillin_login_credentials(username, password, page); - logger.trace( - "Login execution completed... proceeding with test script execution." - ); + logger.debug("Login credentials filled"); // wait for 5 seconds await page.waitForTimeout(5000); const screenshot_after_login = await page.screenshot(); - const screenshot_after_login_base64 = - screenshot_after_login.toString("base64"); + const screenshot_after_login_base64 = screenshot_after_login.toString("base64"); + + logger.debug("Post-login screenshot captured", { + size: screenshot_after_login_base64.length + }); // Asynchronously check the status of the test script. const testScriptReviewResponsePromise_after_login = - testCaseReviewAgent.checkTestScriptStatus( - screenshot_after_login_base64 - ); + testCaseReviewAgent.checkTestScriptStatus(screenshot_after_login_base64); // Asynchronously emit the test script review response to the socket. testScriptReviewResponsePromise_after_login.then( (testScriptReviewResponse) => { - logger.debug( - "Sending screenshot after login to Test Script Review Agent" - ); - // Emit the test script review response to the socket. + logger.debug("Sending post-login screenshot to TestScriptReviewAgent"); socket.emit("testscriptupdate", testScriptReviewResponse); - logger.trace( - `Test script state emitted after login: ${JSON.stringify( - testScriptReviewResponse, - null, - 2 - )}` - ); } - ); + ).catch((error) => { + logger.error("Post-login test script review failed", { + error: error instanceof Error ? 
error.message : error + }); + }); await loginService.click_login_button(page); - socket.emit( - "message", - "Login step executed... proceeding with test script execution." - ); + socket.emit("message", "Login step executed... proceeding with test script execution."); + logger.info("Login process completed"); - modelInput = { + cua_model_input = { screenshotBase64: screenshot_after_login_base64, previousResponseId: undefined, lastCallId: undefined, }; } else { // If login is not required, use the screenshot before login. - modelInput = { + logger.debug("No login required - using initial screenshot"); + cua_model_input = { screenshotBase64: screenshot_before_login_base64, previousResponseId: undefined, lastCallId: undefined, @@ -145,17 +137,14 @@ export async function cuaLoopHandler( } // Start with an initial call (without a screenshot or call_id) + logger.debug("Setting up CUA model"); + const userInfoStr = userInfo ?? ""; - let initial_response = await setupCUAModel(systemPrompt, userInfoStr); - - logger.debug( - `Initial response from CUA model: ${JSON.stringify( - initial_response, - null, - 2 - )}` - ); - logger.debug(`Starting computer use loop...`); + let initial_response = await cua_service.setupCUAModel(systemPrompt, userInfoStr); + + logger.info("CUA model setup completed", { + responseId: initial_response.id + }); const response = await computerUseLoop( page, @@ -180,6 +169,11 @@ export async function cuaLoopHandler( }); } } catch (error) { - logger.error(`Error during playwright loop: ${error}`); + logger.error("Test script execution failed", { + error: error instanceof Error ? error.message : error, + url, + loginRequired + }); + socket.emit("message", "Test script execution failed. 
Please check the logs."); } } diff --git a/cua-server/src/handlers/test-case-initiation-handler.ts b/cua-server/src/handlers/test-case-initiation-handler.ts index 6c7a04a..2b5f99b 100644 --- a/cua-server/src/handlers/test-case-initiation-handler.ts +++ b/cua-server/src/handlers/test-case-initiation-handler.ts @@ -1,7 +1,7 @@ import { Socket } from "socket.io"; import logger from "../utils/logger"; import TestCaseAgent from "../agents/test-case-agent"; -import { convertTestCaseToSteps, TestCase } from "../utils/testCaseUtils"; +import { convertTestCaseToSteps, TestCase } from "../utils/test-case-utils"; import { cuaLoopHandler } from "./cua-loop-handler"; import TestScriptReviewAgent from "../agents/test-script-review-agent"; @@ -9,7 +9,7 @@ export async function handleTestCaseInitiated( socket: Socket, data: any ): Promise { - logger.debug(`Received testCaseInitiated with data: ${JSON.stringify(data)}`); + logger.debug(`Received Test Case Initiated with data:\n${JSON.stringify(data)}`); try { const { testCase, url, userName, password, userInfo } = data as { testCase: string; @@ -29,11 +29,11 @@ export async function handleTestCaseInitiated( ); // Create system prompt by combining form inputs. - const msg = `${testCase} URL: ${url} User Name: ${userName} Password: *********\n USER INFO:\n${userInfo}`; + const msg = `${testCase} URL: ${url} User Name: ${userName} Password: *********\nUSER INFO:\n${userInfo}`; const testCaseAgent = new TestCaseAgent(loginRequired); - const testCaseResponse = await testCaseAgent.invokeResponseAPI(msg); + const testCaseResponse = await testCaseAgent.generateTestCases(msg); const testCaseJson = JSON.stringify(testCaseResponse); // Create a new test case review agent. @@ -58,15 +58,13 @@ export async function handleTestCaseInitiated( // Set the test case review agent in the socket. 
socket.data.testCaseReviewAgent = testCaseReviewAgent; - - logger.debug(`Cleaned test case: ${testCaseJson}`); + logger.trace(`Cleaned Test Cases\n${testCaseJson}`); socket.emit("testcases", testCaseJson); socket.emit("message", "Task steps created."); const testScript = convertTestCaseToSteps(testCaseResponse as TestCase); - - logger.debug(`Test script: ${testScript}`); + logger.debug(`Test Script:\n${testScript}`); // Start the test execution using the provided URL. // Pass the test case review agent to the cuaLoopHandler. diff --git a/cua-server/src/handlers/user-messages-handler.ts b/cua-server/src/handlers/user-messages-handler.ts index bf105cb..22595a3 100644 --- a/cua-server/src/handlers/user-messages-handler.ts +++ b/cua-server/src/handlers/user-messages-handler.ts @@ -1,13 +1,16 @@ -import { ModelInput } from "../services/openai-cua-client"; import logger from "../utils/logger"; import { Socket } from "socket.io"; -import { sendInputToModel } from "../services/openai-cua-client"; +import { cua_service, CUAModelInput } from "../services/openai-cua-service"; import { computerUseLoop } from "../lib/computer-use-loop"; + export async function handleSocketMessage( socket: Socket, msg: string ): Promise { - logger.debug(`Server received message: ${msg}`); + logger.debug("Handling socket message", { + messageLength: msg.length, + socketId: socket.id + }); // A message from user resumes the test script or instructs model to take an action. 
const page = socket.data.page; @@ -18,13 +21,15 @@ export async function handleSocketMessage( const screenshotBase64 = screenshot.toString("base64"); const lastCallId = socket.data.lastCallId; - const modelInput: ModelInput = { + const modelInput: CUAModelInput = { screenshotBase64: screenshotBase64, previousResponseId: previousResponseId, lastCallId: lastCallId, }; - const resumeResponse = await sendInputToModel(modelInput, msg); + logger.debug("Sending input to CUA model"); + + const resumeResponse = await cua_service.sendScreenshotToModel(modelInput, msg); const response = await computerUseLoop( page, @@ -38,6 +43,8 @@ export async function handleSocketMessage( ); if (messageResponse.length > 0) { + logger.debug("Emitting model messages to socket"); + messageResponse.forEach((message: any) => { if (Array.isArray(message.content)) { message.content.forEach((contentBlock: any) => { diff --git a/cua-server/src/index.ts b/cua-server/src/index.ts index 2dd1a05..22f8439 100644 --- a/cua-server/src/index.ts +++ b/cua-server/src/index.ts @@ -37,9 +37,7 @@ io.on("connection", (socket) => { // Log all events socket.onAny((event, msg) => { - logger.trace( - `Received event: ${event} with message: ${JSON.stringify(msg)}` - ); + logger.trace(`Received event: ${event} with message:\n${JSON.stringify(msg, null, 2)}`); }); // Handle incoming messages diff --git a/cua-server/src/lib/computer-use-loop.ts b/cua-server/src/lib/computer-use-loop.ts index 7c80708..6d77f82 100644 --- a/cua-server/src/lib/computer-use-loop.ts +++ b/cua-server/src/lib/computer-use-loop.ts @@ -1,9 +1,6 @@ // lib/modules/computer-use-loop.ts import { Page } from "playwright"; -import { - sendInputToModel, - sendFunctionCallOutput, -} from "../services/openai-cua-client"; +import { cua_service } from "../services/openai-cua-service"; import { handleModelAction } from "../handlers/action-handler"; import logger from "../utils/logger"; import { Socket } from "socket.io"; @@ -21,17 +18,20 @@ export async 
function computerUseLoop( switchedToNewTab: boolean = false // <-- Flag to ensure recursion happens only once for a new tab. ) { await page.screenshot({ path: "screenshot.png" }); + logger.debug("Starting computer use loop", { + responseId: response.id, + socketId: socket.id + }); + while (true) { // Check if the test case status is 'fail'. if (socket.data.testCaseStatus === "fail") { - logger.debug("Test case failed. Exiting the computer use loop."); - + logger.info("Test case failed - exiting computer use loop"); return response; } if (socket.data.testCaseStatus === "pass") { - logger.debug("Test case passed. Exiting the computer use loop."); - + logger.info("Test case passed - exiting computer use loop"); return response; } @@ -47,7 +47,8 @@ export async function computerUseLoop( if (functionCalls.length > 0) { for (const funcCall of functionCalls) { if (funcCall.name === "mark_done") { - response = await sendFunctionCallOutput( + logger.info("Processing mark_done function call"); + response = await cua_service.sendFunctionResult( funcCall.call_id, response.id, { @@ -66,11 +67,9 @@ export async function computerUseLoop( socket.data.previousResponseId = response.id; if (computerCalls.length === 0) { - logger.debug("No computer call found. Final output from model:"); + logger.debug("No computer call found in model response"); response.output.forEach((item: any) => { - logger.debug( - `Output from the model - ${JSON.stringify(item, null, 2)}` - ); + logger.trace("Output from the Model", `${JSON.stringify(item, null, 2)}`); }); const messageResponse = response.output.filter( @@ -81,20 +80,16 @@ export async function computerUseLoop( // Check if the response is a message. // NOTE: This is unused in this demo as we force the model to call tools with tool_choice = required // Update this logic to handle messages from the model if needed for your use case if (messageResponse.length > 0) { - logger.debug( - "Response is a message. 
Trying to get answer from CUA Control Agent." - ); + logger.debug("Processing message response from CUA model"); const message = messageResponse[0].content[0].text; - logger.debug(`Message from the CUA model: ${message}`); + logger.debug("CUA model message", { message }); if (!message.call_id) { - logger.debug( - `No call id found in the message. Exiting the computer use loop.` - ); + logger.warn("No call id found in message - exiting computer use loop"); } - response = await sendInputToModel( + response = await cua_service.sendScreenshotToModel( { screenshotBase64: "", previousResponseId: response.id, @@ -104,9 +99,7 @@ export async function computerUseLoop( ); } else { // If its not a computer_call, we just return the response. - logger.debug( - `Response for the model is neither a computer_call nor a message. Returning the response.` - ); + logger.debug("Response is neither computer_call nor message - returning response"); return response; } } else { @@ -122,7 +115,7 @@ export async function computerUseLoop( : "No reasoning provided"; socket.emit("message", `${summaryText}`); - logger.debug(`Model reasoning: ${summaryText}`); + logger.debug(`Model Reasoning:\n${JSON.stringify({ summary: summaryText })}`); }); } @@ -135,7 +128,7 @@ export async function computerUseLoop( computerCall.pending_safety_checks.length > 0 ) { const safetyCheck = computerCall.pending_safety_checks[0]; - logger.error(`Safety check detected: ${safetyCheck.message}`); + logger.error(`Safety check detected:\n${JSON.stringify({ message: safetyCheck.message }, null, 2)}`); socket.emit("message", `Safety check detected: ${safetyCheck.message}`); socket.emit( "message", @@ -151,11 +144,14 @@ export async function computerUseLoop( const action = (computerCall as any).action; + logger.debug(`Processing Computer Action: ${action?.type}`); + // Take a screenshot of the page before the action is executed. 
if (["click"].includes(action?.type)) { const screenshotBuffer = await page.screenshot(); const screenshotBase64 = screenshotBuffer.toString("base64"); + logger.debug("Sending screenshot to test script review agent"); const testScriptReviewResponsePromise = testCaseReviewAgent.checkTestScriptStatus(screenshotBase64); // Asynchronously emit the test script review response to the socket. @@ -164,9 +160,9 @@ export async function computerUseLoop( socket.emit("testscriptupdate", testScriptReviewResponse); }) .catch((error) => { - logger.error( - "Error during test script review: {error: " + error + "}" - ); + logger.error("Test script review failed", { + error: error instanceof Error ? error.message : error + }); socket.emit("testscriptupdate", { error: "Review processing failed.", }); @@ -185,26 +181,24 @@ export async function computerUseLoop( if (pages.length > 1 && !switchedToNewTab) { // Assume the new tab is the last page. const newPage = pages[pages.length - 1]; - logger.debug( - "New tab detected. Switching context to the new tab (recursion will happen only once)." - ); + logger.info("New tab detected - switching context"); // Continue with your logic using newPage... 
const viewport = newPage.viewportSize(); - logger.trace( - `Viewport dimensions of new page: ${viewport?.width}, ${viewport?.height}` - ); + logger.debug(`New page viewport:\n${JSON.stringify({ + width: viewport?.width, + height: viewport?.height + }, null, 2)}`); if ( !viewport || viewport.width !== defaultWidth || viewport.height !== defaultHeight ) { - logger.debug( - `Resetting viewport size from (${viewport?.width || "undefined"}, ${ - viewport?.height || "undefined" - }) to default (${defaultWidth}, ${defaultHeight}).` - ); + logger.debug(`Resetting viewport size:\n${JSON.stringify({ + from: `${viewport?.width || "undefined"}x${viewport?.height || "undefined"}`, + to: `${defaultWidth}x${defaultHeight}` + }, null, 2)}`); await newPage.setViewportSize({ width: defaultWidth, height: defaultHeight, @@ -216,16 +210,15 @@ export async function computerUseLoop( const screenshotBase64 = screenshotBuffer.toString("base64"); // Send the screenshot back as a computer_call_output. - response = (await sendInputToModel({ + response = (await cua_service.sendScreenshotToModel({ screenshotBase64, previousResponseId: response.id, lastCallId, })) as any; - logger.info( - "Recursively calling computerUseLoop with new page context." - ); - logger.trace(`Response: ${JSON.stringify(response, null, 2)}`); + logger.info("Recursively calling computerUseLoop with new page context"); + logger.trace(`CUAModelResponse: ${JSON.stringify(response, null, 2)}`); + // Recursively call the computerUseLoop with the new page. response = await computerUseLoop( @@ -240,13 +233,13 @@ export async function computerUseLoop( } let screenshotBuffer, screenshotBase64; - logger.trace("Capturing updated screenshot..."); + logger.debug("Capturing updated screenshot"); screenshotBuffer = await getScreenshotWithRetry(page); screenshotBase64 = screenshotBuffer.toString("base64"); // Send the screenshot back as a computer_call_output. 
- response = (await sendInputToModel({ + response = (await cua_service.sendScreenshotToModel({ screenshotBase64, previousResponseId: response.id, lastCallId, @@ -264,7 +257,11 @@ async function getScreenshotWithRetry( const screenshot = await page.screenshot(); return screenshot; } catch (error) { - logger.error(`Attempt ${attempt} - Error capturing screenshot: ${error}`); + logger.error("Screenshot capture failed", { + attempt, + maxRetries: retries, + error: error instanceof Error ? error.message : error + }); if (attempt === retries) { throw error; } diff --git a/cua-server/src/lib/constants.ts b/cua-server/src/lib/constants.ts index beaba8a..ecc8fba 100644 --- a/cua-server/src/lib/constants.ts +++ b/cua-server/src/lib/constants.ts @@ -6,7 +6,7 @@ export const PROMPT_WITH_LOGIN = ` The first 3 steps are always: 1. Open the browser and navigate to the login URL. 2. Enter username and password. - 3. Click *Log In* and verify successful sign‑in. + 3. Click *Log In* and verify successful sign-in. Then add the actual test steps the user asked for. 
@@ -15,7 +15,7 @@ export const PROMPT_WITH_LOGIN = ` "steps": [ { "step_number": 1, "step_instructions": "Open a web browser and navigate to the login URL: ", "status": "pending" }, { "step_number": 2, "step_instructions": "Enter the username '' and password '********'.", "status": "pending" }, - { "step_number": 3, "step_instructions": "Click the 'Log In' button and verify successful sign‑in.", "status": "pending" }, + { "step_number": 3, "step_instructions": "Click the 'Log In' button and verify successful sign-in.", "status": "pending" }, { "step_number": 4, "step_instructions": "From the home page, click the 'Accounts' tab.", "status": "pending" }, { "step_number": 5, "step_instructions": "Click 'New' to create a new account.", "status": "pending" }, { "step_number": 6, "step_instructions": "Fill the form with mock data (e.g., Account Name 'Test Account').", "status": "pending" }, @@ -48,6 +48,17 @@ export const PROMPT_WITHOUT_LOGIN = ` } `; +// CUA system prompt template +export const CUA_SYSTEM_PROMPT = `You are a testing agent. You will be given a list of instructions with steps to test a web application. +You will need to navigate the web application and perform the actions described in the instructions. +Try to accomplish the provided task in the simplest way possible. +Once you believe you are done with all the tasks required or you are blocked and cannot progress +(for example, you have tried multiple times to accomplish a task but keep getting errors or blocked), +use the mark_done tool to let the user know you have finished the tasks. +You do not need to authenticate on the user's behalf, the user will authenticate and your flow starts after that. + `; + + export const TEST_SCRIPT_REVIEW_PROMPT = ` You are a test script review agent. You will be given a set of test cases in the format below and screenshots of the test results. 
diff --git a/cua-server/src/services/login-service.ts b/cua-server/src/services/login-service.ts index 7a35b60..36b6650 100644 --- a/cua-server/src/services/login-service.ts +++ b/cua-server/src/services/login-service.ts @@ -30,7 +30,6 @@ export class LoginService { .first() .fill(password, { timeout: 5_000 }); - logger.debug("Login credentials filled in."); return true; } catch (error) { logger.error("❌ Error filling login credentials:", error); diff --git a/cua-server/src/services/openai-cua-client.ts b/cua-server/src/services/openai-cua-client.ts deleted file mode 100644 index 217bc50..0000000 --- a/cua-server/src/services/openai-cua-client.ts +++ /dev/null @@ -1,162 +0,0 @@ -import OpenAI from "openai"; -import logger from "../utils/logger"; - -const openai = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY, -}); - -// Environment specific instructions for the CUA model e.g., MacOS specific actions CMD+A vs CTRL+A -const envInstructions = process.env.ENV_SPECIFIC_INSTRUCTIONS || ""; - -const cuaPrompt = `You are a testing agent. You will be given a list of instructions with steps to test a web application. -You will need to navigate the web application and perform the actions described in the instructions. -Try to accomplish the provided task in the simplest way possible. -Once you believe your are done with all the tasks required or you are blocked and cannot progress -(for example, you have tried multiple times to acommplish a task but keep getting errors or blocked), -use the mark_done tool to let the user know you have finished the tasks. 
-You do not need to authenticate on user's behalf, the user will authenticate and your flow starts after that.`; - -// Helper: Read display dimensions from env -const displayWidth: number = parseInt(process.env.DISPLAY_WIDTH || "1024", 10); -const displayHeight: number = parseInt(process.env.DISPLAY_HEIGHT || "768", 10); - -const tools = [ - { - type: "computer_use_preview", - display_width: displayWidth, - display_height: displayHeight, - environment: "browser", - }, - { - type: "function", - name: "mark_done", - description: - "Use this tool to let the user know you have finished the tasks.", - parameters: {}, - }, -]; - -interface OpenAIResponse { - id: string; - output: Array; -} - -export interface ModelInput { - screenshotBase64: string; - previousResponseId?: string; - lastCallId?: string; -} - -// Helper to construct and send a request to the CUA model -async function callCUAModel(input: any[], previousResponseId?: string) { - logger.trace("Sending request body to the model..."); - - const requestBody: any = { - model: "computer-use-preview", - tools, - input, - reasoning: { - generate_summary: "concise", - }, - truncation: "auto", - tool_choice: "required", - }; - - if (previousResponseId) { - requestBody.previous_response_id = previousResponseId; - logger.trace( - `Adding previous response ID to the request body: ${previousResponseId}` - ); - } - - logger.trace( - `Calling CUA model API with the request body: ${JSON.stringify( - requestBody, - null, - 2 - )}` - ); - const response = await openai.responses.create(requestBody); - - logger.trace("Received response from the model."); - return response; -} - -/** - * Sends input (or screenshot output) to the OpenAI model. - * If no lastCallId is provided, it sends an initial query. 
- */ -export async function sendInputToModel( - { screenshotBase64, previousResponseId, lastCallId }: ModelInput, - userMessage?: string -): Promise { - logger.trace("Building image input for the model..."); - const input: any[] = []; - - if (lastCallId) { - // This is a follow-up call with a screenshot - logger.trace( - `Adding screenshot to the input with the call ID: ${lastCallId}` - ); - input.push({ - call_id: lastCallId, - type: "computer_call_output", - output: { - type: "input_image", - image_url: `data:image/png;base64,${screenshotBase64}`, - }, - }); - } - - if (userMessage) { - input.push({ - role: "user", - content: userMessage, - }); - } - - return callCUAModel(input, previousResponseId); -} - -export async function sendFunctionCallOutput( - callId: string, - previousResponseId: string, - outputObj: object = {} -): Promise { - const input = [ - { - call_id: callId, - type: "function_call_output", - output: JSON.stringify(outputObj), - }, - ]; - - return callCUAModel(input, previousResponseId); -} - -export async function setupCUAModel(systemPrompt: string, userInfo: string) { - logger.trace("Setting up CUA model..."); - const input: any[] = []; - - const cua_initiation_prompt = `${cuaPrompt} - ${ - envInstructions - ? 
"Environment specific instructions: " + envInstructions - : "" - } - `; - - logger.trace(`CUA system prompt: ${cua_initiation_prompt}`); - - input.push({ - role: "system", - content: cua_initiation_prompt, - }); - - input.push({ - role: "user", - content: `INSTRUCTIONS:\n${systemPrompt}\n\nUSER INFO:\n${userInfo}`, - }); - - return callCUAModel(input); -} diff --git a/cua-server/src/services/openai-cua-service.ts b/cua-server/src/services/openai-cua-service.ts new file mode 100644 index 0000000..aa4303a --- /dev/null +++ b/cua-server/src/services/openai-cua-service.ts @@ -0,0 +1,186 @@ +import { OpenAI, AzureOpenAI } from "openai"; +import logger from "../utils/logger"; +import { CUA_SYSTEM_PROMPT } from "../lib/constants"; + +// If USE_OPENAI=true, use OpenAI, otherwise use Azure-OpenAI +const cua_client = process.env.USE_OPENAI === 'true' + ? new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }) + : new AzureOpenAI({ + apiKey: process.env.AZURE_API_KEY, + apiVersion: process.env.AZURE_API_VERSION, + endpoint: process.env.AZURE_ENDPOINT, + }); + +// Environment specific instructions for the CUA model +const envInstructions = process.env.ENV_SPECIFIC_INSTRUCTIONS || ""; + +// Display dimensions for computer use +const displayWidth: number = parseInt(process.env.DISPLAY_WIDTH || "1024", 10); +const displayHeight: number = parseInt(process.env.DISPLAY_HEIGHT || "768", 10); + +// Computer use tools configuration +const CUA_TOOLS = [ + { + type: "computer_use_preview", + display_width: displayWidth, + display_height: displayHeight, + environment: "browser", + }, + { + type: "function", + name: "mark_done", + description: "Use this tool to let the user know you have finished the tasks.", + parameters: {}, + }, +]; + +// CUA-specific interfaces +export interface CUAModelInput { + screenshotBase64: string; + previousResponseId?: string; + lastCallId?: string; +} + +export interface CUAModelResponse { + id: string; + output: Array; +} + +class OpenAICUAService { + 
private readonly model: string; + + constructor() { + // Use different model names based on provider + if (process.env.USE_OPENAI === 'true') { + this.model = process.env.OPENAI_COMPUTER_USE_MODEL || 'computer-use-preview'; + } else { + this.model = process.env.AZURE_COMPUTER_USE_MODEL_DEPLOYMENT_NAME || 'computer-use-preview'; + } + } + + /** + * Initialize CUA model with system and user prompts + */ + async setupCUAModel(systemPrompt: string, userInfo: string): Promise { + logger.debug(`Started CUA Model Setup:\n${JSON.stringify({ + systemPromptLength: systemPrompt.length, + userInfoLength: userInfo.length + }, null, 2)}`); + + const enhancedSystemPrompt = `${CUA_SYSTEM_PROMPT} + ${envInstructions ? `Environment specific instructions: ${envInstructions}` : ""}`; + + const input = [ + { + role: "system", + content: enhancedSystemPrompt, + }, + { + role: "user", + content: `INSTRUCTIONS:\n${systemPrompt}\n\nUSER INFO:\n${userInfo}`, + }, + ]; + + return this.callCUAModel(input); + } + + /** + * Send screenshot and user input to CUA model + */ + async sendScreenshotToModel( + { screenshotBase64, previousResponseId, lastCallId }: CUAModelInput, + userMessage?: string + ): Promise { + logger.debug("Sending screenshot to CUA model", { + screenshotSize: screenshotBase64.length, + hasCallId: !!lastCallId, + hasUserMessage: !!userMessage, + hasPreviousResponse: !!previousResponseId + }); + + const input: any[] = []; + + if (lastCallId) { + input.push({ + call_id: lastCallId, + type: "computer_call_output", + output: { + type: "input_image", + image_url: `data:image/png;base64,${screenshotBase64}`, + }, + }); + } + + if (userMessage) { + input.push({ + role: "user", + content: userMessage, + }); + } + + return this.callCUAModel(input, previousResponseId); + } + + /** + * Send function call result back to CUA model + */ + async sendFunctionResult( + callId: string, + previousResponseId: string, + resultData: object = {} + ): Promise { + logger.debug(`Sending function result 
to CUA Model:\n${JSON.stringify({ + callId, + previousResponseId, + resultData + }, null, 2)}`); + + const input = [ + { + call_id: callId, + type: "function_call_output", + output: JSON.stringify(resultData), + }, + ]; + + return this.callCUAModel(input, previousResponseId); + } + + /** + * Core CUA model call with proper request configuration + */ + async callCUAModel(input: any[], previousResponseId?: string): Promise { + logger.debug("Sending request to CUA model"); + + const requestBody: any = { + model: this.model, + tools: CUA_TOOLS, + input, + reasoning: { + generate_summary: "concise", + }, + truncation: "auto", + tool_choice: "required", + }; + + if (previousResponseId) { + requestBody.previous_response_id = previousResponseId; + logger.debug(`Including previous ResponseID: ${ previousResponseId }`); + } + + try { + const response = await cua_client.responses.create(requestBody); + logger.debug(`CUA model response received: ${ response.id }`); + return response as CUAModelResponse; + } catch (error) { + logger.error("CUA model call failed", { error: error instanceof Error ? error.message : error }); + throw new Error(`CUA model call failed: ${error instanceof Error ? error.message : 'Unknown error'}`); + } + } + +} + +// Export singleton instance +export const cua_service = new OpenAICUAService(); \ No newline at end of file diff --git a/cua-server/src/services/openai-service.ts b/cua-server/src/services/openai-service.ts new file mode 100644 index 0000000..7168ba0 --- /dev/null +++ b/cua-server/src/services/openai-service.ts @@ -0,0 +1,161 @@ +import { OpenAI, AzureOpenAI } from "openai"; +import logger from "../utils/logger"; + +// If USE_OPENAI=true, use OpenAI, otherwise use Azure-OpenAI +const client = process.env.USE_OPENAI === 'true' + ? 
new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }) + : new AzureOpenAI({ + apiKey: process.env.AZURE_API_KEY, + apiVersion: process.env.AZURE_API_VERSION, + endpoint: process.env.AZURE_ENDPOINT, + }); + +// Unified OpenAI service interface +export interface ResponseAPIInput { + systemPrompt: string; + userMessage?: string; + base64Image?: string; + previousResponseId?: string; + model?: string; + schema?: object; + schemaName?: string; +} + +export interface ResponseAPIResponse { + id: string; + output: Array; + output_text?: string; + output_parsed?: any; +} + +class OpenAIService { + private buildUserContent(userMessage?: string, base64Image?: string): any { + // If we have both text and image, create multimodal content + if (userMessage && base64Image) { + return [ + { + type: "input_text", + text: userMessage, + }, + { + type: "input_image", + image_url: `data:image/png;base64,${base64Image}`, + detail: "high", + } + ]; + } + + // If only text + if (userMessage) { + return userMessage; + } + + // If only image + if (base64Image) { + return [ + { + type: "input_image", + image_url: `data:image/png;base64,${base64Image}`, + detail: "high", + } + ]; + } + + return userMessage; + } + + /** + * Unified response API call that handles all scenarios + */ + async responseAPI({ + systemPrompt, + userMessage, + base64Image, + previousResponseId, + model, + schema, + schemaName + }: ResponseAPIInput): Promise { + + const operationName = schema ? 
`${schemaName} Agent` : "Basic Response"; + logger.debug(`OpenAI ResponseAPI called for: ${operationName}`); + logger.trace(`OpenAI ResponseAPI called with input:\n${JSON.stringify({ + systemPrompt: systemPrompt, + userMessage: userMessage, + hasBase64Image: !!base64Image, + previousResponseId: previousResponseId || null, + model: model || null, + schemaName: schemaName || null, + }, null, 2)}`); + + const userContent = this.buildUserContent(userMessage, base64Image); + + const requestBody: any = { + model: model, + input: [ + { role: "system", content: systemPrompt }, + { role: "user", content: userContent }, + ], + }; + + // Add JSON schema if provided + if (schema) { + requestBody.text = { + format: { + type: "json_schema", + name: schemaName || "response_output", + schema: schema + } + }; + } + + // Add previous response ID if provided + if (previousResponseId) { + requestBody.previous_response_id = previousResponseId; + } + + logger.trace(`Request Body:\n${JSON.stringify(requestBody, null, 2)}`); + + try { + // First attempt + const response = await client.responses.create(requestBody); + logger.debug(`${operationName} Completed Successfully!\n${JSON.stringify({ + responseId: response.id, + model: model + }, null, 2)}`); + return response as ResponseAPIResponse; + } catch (error) { + logger.error(`${operationName} failed\n${JSON.stringify({ + error: error instanceof Error ? error.message : error, + model: model, + errorType: error instanceof Error ? 
error.constructor.name : 'Unknown' + }, null, 2)}`); + + // Retry without previousResponseId if it was provided + if (previousResponseId) { + logger.debug("Retrying without PreviousResponseId..."); + delete requestBody.previous_response_id; + + try { + const retryResponse = await client.responses.create(requestBody); + logger.debug(`${operationName} retry succeeded\n${JSON.stringify({ + responseId: retryResponse.id, + model: model + }, null, 2)}`); + return retryResponse as ResponseAPIResponse; + } catch (retryError) { + logger.error(`${operationName} retry also failed\n${JSON.stringify({ + error: retryError instanceof Error ? retryError.message : retryError, + model: model + }, null, 2)}`); + } + } + + throw error; + } + } +} + +export const openai_service = new OpenAIService(); \ No newline at end of file diff --git a/cua-server/src/utils/logger.ts b/cua-server/src/utils/logger.ts index 0d80e2f..2e50731 100644 --- a/cua-server/src/utils/logger.ts +++ b/cua-server/src/utils/logger.ts @@ -1,21 +1,67 @@ import pino from "pino"; +import path from "path"; +import fs from "fs"; const isProduction = process.env.NODE_ENV === "production"; +const logLevel = process.env.LOG_LEVEL || "debug"; -const logger = pino({ - level: process.env.LOG_LEVEL || "info", - ...(isProduction - ? 
{} - : { - transport: { - target: "pino-pretty", - options: { - colorize: true, - }, - }, +// Ensure logs directory exists +const logsDir = path.join(process.cwd(), "logs"); +if (!fs.existsSync(logsDir)) { + fs.mkdirSync(logsDir, { recursive: true }); +} + +// Clear existing log files on startup +const appLogPath = path.join(logsDir, "app.log"); +const errorLogPath = path.join(logsDir, "error.log"); + +try { + if (fs.existsSync(appLogPath)) { + fs.writeFileSync(appLogPath, ""); + } + if (fs.existsSync(errorLogPath)) { + fs.writeFileSync(errorLogPath, ""); + } +} catch (error) { + console.warn("Warning: Could not clear log files on startup:", error); +} + +const logger = pino( + { + level: logLevel, + }, + pino.multistream([ + // Console output (pretty in development) + { + level: logLevel, + stream: isProduction + ? process.stdout + : pino.transport({ + target: "pino-pretty", + options: { + colorize: true, + }, + }), + }, + // File output + { + level: logLevel, + stream: pino.destination({ + dest: appLogPath, + sync: false, + }), + }, + // Separate error log file + { + level: "error", + stream: pino.destination({ + dest: errorLogPath, + sync: false, }), -}); + }, + ]) +); -logger.info(`Logger initialized with log level: ${logger.level}`); +logger.info(`Logger initialized with log level: ${logLevel}`); export default logger; diff --git a/cua-server/src/utils/screenshot-utils.ts b/cua-server/src/utils/screenshot-utils.ts new file mode 100644 index 0000000..9601f85 --- /dev/null +++ b/cua-server/src/utils/screenshot-utils.ts @@ -0,0 +1,61 @@ +import fs from "fs"; +import path from "path"; +import { v4 as uuidv4 } from "uuid"; +import logger from "./logger"; + +export class ScreenshotUtils { + private runFolder: string; + + constructor(runFolder?: string) { + this.runFolder = runFolder || uuidv4(); + } + + /** + * Gets the current run folder ID. + */ + getRunFolder(): string { + return this.runFolder; + } + + /** + * Sets the run folder ID. 
+ */ + setRunFolder(runFolder: string): void { + this.runFolder = runFolder; + } + + /** + * Ensures the run folder exists, creating it if necessary. + */ + async ensureRunFolder(): Promise { + const runFolderPath = this.getRunFolderPath(); + + if (!fs.existsSync(runFolderPath)) { + fs.mkdirSync(runFolderPath, { recursive: true }); + logger.debug("Run folder created", { path: runFolderPath }); + } + } + + /** + * Gets the full path to the run folder. + */ + getRunFolderPath(): string { + return path.join(process.cwd(), "..", "frontend", "public", "test_results", this.runFolder); + } + + /** + * Saves a screenshot and returns the relative path. + */ + async saveScreenshot(base64Image: string): Promise { + await this.ensureRunFolder(); + + const screenshotFilename = `${uuidv4()}.png`; + const screenshotPathLocal = path.join(this.getRunFolderPath(), screenshotFilename); + const relativePath = `/test_results/${this.runFolder}/${screenshotFilename}`; + + fs.writeFileSync(screenshotPathLocal, Buffer.from(base64Image, "base64")); + logger.debug("Screenshot saved", { relativePath }); + + return relativePath; + } +} \ No newline at end of file diff --git a/cua-server/src/utils/testCaseUtils.ts b/cua-server/src/utils/test-case-utils.ts similarity index 59% rename from cua-server/src/utils/testCaseUtils.ts rename to cua-server/src/utils/test-case-utils.ts index 70e3947..bec653c 100644 --- a/cua-server/src/utils/testCaseUtils.ts +++ b/cua-server/src/utils/test-case-utils.ts @@ -19,6 +19,30 @@ export interface TestCase { steps: TestCaseStep[]; } +// Test case JSON schema for structured output +export const TEST_CASE_JSON_SCHEMA = { + type: "object", + properties: { + steps: { + type: "array", + items: { + type: "object", + properties: { + step_number: { type: "number" }, + step_instructions: { type: "string" }, + status: {type: ["string", "null"]} + }, + required: ["step_number", "step_instructions", "status"], + additionalProperties: false + }, + strict: true + } + }, + 
required: ["steps"], + additionalProperties: false +} as const; + + export function convertTestCaseToSteps(testCase: TestCase): string { if (!testCase.steps || !Array.isArray(testCase.steps)) { throw new Error("Invalid test case format: missing steps array"); diff --git a/cua-server/src/utils/test-script-utils.ts b/cua-server/src/utils/test-script-utils.ts new file mode 100644 index 0000000..79aafae --- /dev/null +++ b/cua-server/src/utils/test-script-utils.ts @@ -0,0 +1,78 @@ +// Utilities for test script review functionality +import logger from "./logger"; + +export interface TestScriptStep { + step_number: number; + status: "pending" | "Pass" | "Fail"; + step_reasoning: string; + image_path?: string; +} + +export interface TestScriptState { + steps: TestScriptStep[]; +} + +// Test script JSON schema for structured output +export const TEST_SCRIPT_AGENT_JSON_SCHEMA = { + type: "object", + properties: { + steps: { + type: "array", + items: { + type: "object", + properties: { + step_number: { type: "number" }, + status: { + type: "string", + enum: ["pending", "Pass", "Fail"] + }, + step_reasoning: { type: "string" } + }, + required: ["step_number", "status", "step_reasoning"], + additionalProperties: false + } + } + }, + required: ["steps"], + additionalProperties: false + } as const; + +/** + * Determines which steps had status changes from pending to Pass/Fail. 
+ */ +export function getStepsWithStatusChange(oldSteps: TestScriptStep[], newSteps: TestScriptStep[]): Set { + const changedSteps = new Set(); + + logger.trace(`Old Steps:\n${JSON.stringify(oldSteps, null, 2)}`); + logger.trace(`New Steps:\n${JSON.stringify(newSteps, null, 2)}`); + + oldSteps.forEach(oldStep => { + const newStep = newSteps.find(s => s.step_number === oldStep.step_number); + if (newStep?.status !== "pending" && oldStep.status === "pending") { + changedSteps.add(oldStep.step_number); + } + }); + + logger.debug(`Status Changes Detected:\n${JSON.stringify({ changedSteps: Array.from(changedSteps) }, null, 2)}`); + return changedSteps; +} + +/** + * Updates step image paths based on status changes and existing paths. + */ +export function updateStepImagePaths( + oldSteps: TestScriptStep[], + newSteps: TestScriptStep[], + changedSteps: Set, + screenshotPath?: string +): void { + newSteps.forEach(newStep => { + const oldStep = oldSteps.find(s => s.step_number === newStep.step_number); + + if (changedSteps.has(newStep.step_number) && screenshotPath) { + newStep.image_path = screenshotPath; + } else if (oldStep?.image_path) { + newStep.image_path = oldStep.image_path; + } + }); +} \ No newline at end of file diff --git a/frontend/.env.example b/frontend/.env.example index 38aa0db..5d33ada 100644 --- a/frontend/.env.example +++ b/frontend/.env.example @@ -1,4 +1,3 @@ -OPENAI_API_KEY=your-openai-key # Optional: Set the display size for the browser. DISPLAY_WIDTH=1024 DISPLAY_HEIGHT=768