Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions packages/magnitude-core/baml_src/planner.baml
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,30 @@ function QueryMemory(memory: AgentContext, query: string, includeClaudeSpoof: bo
// {{ DescribeBrowserExecutionContext(context) }}
// "#
// }

// Output type of GroundActions. It declares no static fields: @@dynamic lets
// callers add properties at runtime through a TypeBuilder (ModelHarness.ground
// adds an `actions` list property before invoking GroundActions).
class GroundedActionList {
@@dynamic
}

// Resolves placeholder coordinates in planned actions against a screenshot.
//
// screenshot: the image the actions are grounded against.
// actions:    the actions to ground, as text (ModelHarness.ground passes
//             pretty-printed JSON).
// Returns a GroundedActionList whose fields are supplied at runtime via TypeBuilder.
//
// The agent merges the returned actions back into its plan by position, so the
// prompt explicitly pins the number and order of the returned actions and
// forbids changing anything except coordinates.
function GroundActions(screenshot: image, actions: string) -> GroundedActionList {
  client GeminiPro
  prompt #"
    {{ _.role("system") }}
    You are a helpful assistant that grounds actions to a screenshot.

    You will receive a list of actions that need to be executed. Some of these actions may be missing specific coordinates (e.g. they may have generic coordinates like 0,0) but describe what they intend to click/scroll.

    Your job is to:
    1. Analyze the screenshot to find the correct elements.
    2. Update the actions with the precise x,y coordinates based on the screenshot.
    3. Return the list of grounded actions.

    Rules:
    - Return exactly one grounded action for every input action, in the same order as the input. Never add, drop, merge, or reorder actions.
    - Keep each action's type and every non-coordinate field exactly as given; only coordinates may change.
    - If you cannot find the target element for an action, return that action unchanged.

    {{ ctx.output_format }}

    {{ _.role("user") }}
    {{ screenshot }}

    Actions to ground:
    {{ actions }}
  "#
}
43 changes: 41 additions & 2 deletions packages/magnitude-core/src/agent/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import { isClaude } from '@/ai/util';
import { retryOnError } from '@/common';
import { renderContentParts } from '@/memory/rendering';
import { MultiModelHarness } from '@/ai/multiModelHarness';
import { BrowserConnector } from '@/connectors/browserConnector';


export interface AgentOptions {
Expand Down Expand Up @@ -167,8 +168,10 @@ export class Agent {
// Look up the registered definition whose name equals this action's variant.
const actionDefinition = this.actions.find(def => def.name === action.variant);

if (!actionDefinition) {
// No registered definition matches. A missing variant gets its own message
// because it points at the shape of the grounding output rather than at the
// agent's configured action space.
if (action.variant === undefined) {
throw new AgentError(`Undefined action type 'undefined'. This usually means the grounding step failed to produce a valid action structure. Ensure the grounding model is returning actions with a 'variant' property matching the available actions.`);
}
// Otherwise the variant names an action that is not in this.actions: possibly
// it came from a connector that is no longer active, or the action space was
// not correctly aggregated.
throw new AgentError(`Undefined action type '${action.variant}'. Ensure agent is configured with appropriate action definitions from connectors.`);
}
return actionDefinition;
Expand Down Expand Up @@ -378,6 +381,42 @@ export class Agent {
`Error planning actions: ${(error as Error).message}`, { variant: 'misalignment' }
)
}

// Ground spatial actions against the latest screenshot (best effort).
// Only runs when a browser connector is available; any failure leaves the
// planned actions untouched.
const browserConnector = this.getConnector(BrowserConnector);
if (browserConnector) {
    // Action variants / definition names starting with one of these prefixes are
    // treated as spatial (currently only mouse interactions).
    const spatialPrefixes = ['mouse:'];
    const isSpatial = (name: string | undefined): boolean =>
        spatialPrefixes.some(prefix => name?.startsWith(prefix) ?? false);

    const actionsToGround = actions.filter(a => isSpatial(a.variant));

    if (actionsToGround.length > 0) {
        // Restrict the grounding model's output vocabulary to spatial action definitions.
        const groundingVocabulary = this.actions.filter(def => isSpatial(def.name));

        try {
            // Fetched only when there is something to ground, and inside the try so a
            // screenshot failure degrades to ungrounded actions like any other grounding failure.
            const screenshot = await browserConnector.getLastScreenshot();
            const groundedSubset = await this.models.ground(screenshot, actionsToGround, groundingVocabulary);

            if (groundedSubset.length !== actionsToGround.length) {
                // Results are paired with inputs by position. With a different count the
                // pairing is unknown, and merging could attach coordinates to the wrong action.
                logger.warn(`Grounding returned ${groundedSubset.length} actions for ${actionsToGround.length} inputs. Proceeding with ungrounded actions.`);
            } else {
                // Merge back: walk the full plan and swap each spatial action for its grounded counterpart.
                let groundedIndex = 0;
                actions = actions.map(a => {
                    if (!isSpatial(a.variant)) {
                        return a;
                    }
                    const grounded = groundedSubset[groundedIndex];
                    groundedIndex++;
                    // Accept the grounded action only if it is still the same kind of action;
                    // otherwise keep the planner's original.
                    return grounded !== undefined && grounded.variant === a.variant ? grounded : a;
                });
            }
        } catch (groundingError) {
            // Proceed with original actions if grounding fails
            logger.warn(`Grounding failed: ${groundingError instanceof Error ? groundingError.message : String(groundingError)}. Proceeding with ungrounded actions.`);
        }
    }
}

logger.info({ reasoning, actions }, `Partial recipe created`);

Expand Down
22 changes: 22 additions & 0 deletions packages/magnitude-core/src/ai/modelHarness.ts
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,28 @@ export class ModelHarness {
return resp.data;
}
}

/**
 * Ask the model to fill in concrete coordinates for a list of actions, using a screenshot.
 *
 * The response schema is built per call: `GroundedActionList` receives an `actions`
 * property typed as a list of whatever `convertActionDefinitionsToBaml` produces
 * for `actionVocabulary`.
 *
 * @param screenshot - Image the actions are grounded against.
 * @param actions - Actions to ground; sent to the model as pretty-printed JSON.
 * @param actionVocabulary - Action definitions used to build the response schema.
 * @returns The `actions` list from the model response, or `[]` when `actions` is empty
 *          (no model call is made in that case).
 * @throws Error if the model response does not contain an `actions` array.
 */
async ground<T>(screenshot: Image, actions: Action[], actionVocabulary: ActionDefinition<T>[]): Promise<Action[]> {
    // Nothing to ground: skip the model call entirely.
    if (actions.length === 0) {
        return [];
    }

    const tb = new TypeBuilder();
    tb.GroundedActionList.addProperty('actions', tb.list(convertActionDefinitionsToBaml(tb, actionVocabulary)));

    const actionsJson = JSON.stringify(actions, null, 2);

    const response = await this.baml.GroundActions(
        await screenshot.toBaml(),
        actionsJson,
        { tb }
    );
    this._reportUsage();

    // GroundedActionList is dynamic, so `actions` is not statically guaranteed to exist.
    // Check the shape before asserting the type instead of handing callers `undefined`.
    const grounded: unknown = response.actions;
    if (!Array.isArray(grounded)) {
        throw new Error(`Grounding response did not contain an 'actions' list`);
    }
    return grounded as Action[];
}

// async classifyCheckFailure(screenshot: Image, check: string, existingRecipe: Action[], tabState: TabState): Promise<BugDetectedFailure | MisalignmentFailure> {
// const stringifiedExistingRecipe = [];
Expand Down
4 changes: 4 additions & 0 deletions packages/magnitude-core/src/ai/multiModelHarness.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ export class MultiModelHarness {
return await this.roles['query'].query(context, query, schema);
}

/**
 * Ground actions using the harness registered under the 'ground' role.
 * Arguments are forwarded unchanged and the harness result is returned as-is.
 */
async ground<T>(screenshot: Image, actions: Action[], actionVocabulary: ActionDefinition<T>[]): Promise<Action[]> {
    const groundingHarness = this.roles['ground'];
    const groundedActions = await groundingHarness.ground(screenshot, actions, actionVocabulary);
    return groundedActions;
}

get numUniqueModels() {
return this.uniqueModels.length;
}
Expand Down
4 changes: 2 additions & 2 deletions packages/magnitude-core/src/ai/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
// confidence: number
// }

export type BrowserAgentRole= 'act' | 'extract' | 'query';
export const allBrowserAgentRoles: BrowserAgentRole[] = ['act', 'extract', 'query'] as const;
/** Role names for browser agent models. */
export type BrowserAgentRole = 'act' | 'extract' | 'query' | 'ground';

/**
 * Every BrowserAgentRole as a runtime list.
 *
 * No `as const` here: it would produce a readonly tuple, which is not assignable
 * to the mutable `BrowserAgentRole[]` this constant is declared as.
 */
export const allBrowserAgentRoles: BrowserAgentRole[] = ['act', 'extract', 'query', 'ground'];

// Approximately mirrors https://docs.boundaryml.com/ref/llm-client-providers
export type LLMClient = (AnthropicClient | ClaudeCodeClient | BedrockClient | GoogleAIClient | GoogleVertexClient | OpenAIClient | OpenAIGenericClient | AzureOpenAIClient) &
Expand Down