
Commit a3938e8

feat: add look_at tool and multimodal-looker agent
Add a new tool and agent for analyzing media files (PDFs, images, diagrams) that require visual interpretation beyond raw text.

- Add `multimodal-looker` agent using Gemini 2.5 Flash model
- Add `look_at` tool that spawns multimodal-looker sessions
- Restrict multimodal-looker from calling task/call_omo_agent/look_at tools

Inspired by Sourcegraph Ampcode's look_at tool design.

🤖 GENERATED WITH ASSISTANCE OF [OhMyOpenCode](https://github.com/code-yeongyu/oh-my-opencode)
1 parent 821b0b8 commit a3938e8
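
For a sense of how the new tool is meant to be called from the main agent, a minimal invocation might look like the sketch below; the file path and goal are illustrative values, not taken from this commit.

// Hypothetical call; look_at takes an absolute file_path and an extraction goal.
look_at({
  file_path: "/absolute/path/to/architecture-overview.pdf",
  goal: "Explain the data flow in this architecture diagram",
})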

10 files changed, +180 -1 lines changed

src/agents/index.ts

Lines changed: 2 additions & 0 deletions
@@ -4,13 +4,15 @@ import { librarianAgent } from "./librarian"
 import { exploreAgent } from "./explore"
 import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
 import { documentWriterAgent } from "./document-writer"
+import { multimodalLookerAgent } from "./multimodal-looker"

 export const builtinAgents: Record<string, AgentConfig> = {
   oracle: oracleAgent,
   librarian: librarianAgent,
   explore: exploreAgent,
   "frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
   "document-writer": documentWriterAgent,
+  "multimodal-looker": multimodalLookerAgent,
 }

 export * from "./types"

src/agents/multimodal-looker.ts

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import type { AgentConfig } from "@opencode-ai/sdk"
+
+export const multimodalLookerAgent: AgentConfig = {
+  description:
+    "Analyze media files (PDFs, images, diagrams) that require interpretation beyond raw text. Extracts specific information or summaries from documents, describes visual content. Use when you need analyzed/extracted data rather than literal file contents.",
+  mode: "subagent",
+  model: "google/gemini-2.5-flash",
+  temperature: 0.1,
+  tools: { Read: true },
+  prompt: `You interpret media files that cannot be read as plain text.
+
+Your job: examine the attached file and extract ONLY what was requested.
+
+When to use you:
+- Media files the Read tool cannot interpret
+- Extracting specific information or summaries from documents
+- Describing visual content in images or diagrams
+- When analyzed/extracted data is needed, not raw file contents
+
+When NOT to use you:
+- Source code or plain text files needing exact contents (use Read)
+- Files that need editing afterward (need literal content from Read)
+- Simple file reading where no interpretation is needed
+
+How you work:
+1. Receive a file path and a goal describing what to extract
+2. Read and analyze the file deeply
+3. Return ONLY the relevant extracted information
+4. The main agent never processes the raw file - you save context tokens
+
+For PDFs: extract text, structure, tables, data from specific sections
+For images: describe layouts, UI elements, text, diagrams, charts
+For diagrams: explain relationships, flows, architecture depicted
+
+Response rules:
+- Return extracted information directly, no preamble
+- If info not found, state clearly what's missing
+- Match the language of the request
+- Be thorough on the goal, concise on everything else
+
+Your output goes straight to the main agent for continued work.`,
+}

src/agents/types.ts

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ export type AgentName =
   | "explore"
   | "frontend-ui-ux-engineer"
   | "document-writer"
+  | "multimodal-looker"

 export type AgentOverrideConfig = Partial<AgentConfig>


src/agents/utils.ts

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,7 @@ import { librarianAgent } from "./librarian"
 import { exploreAgent } from "./explore"
 import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
 import { documentWriterAgent } from "./document-writer"
+import { multimodalLookerAgent } from "./multimodal-looker"
 import { deepMerge } from "../shared"

 const allBuiltinAgents: Record<AgentName, AgentConfig> = {
@@ -13,6 +14,7 @@ const allBuiltinAgents: Record<AgentName, AgentConfig> = {
   explore: exploreAgent,
   "frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
   "document-writer": documentWriterAgent,
+  "multimodal-looker": multimodalLookerAgent,
 }

 function mergeAgentConfig(

src/index.ts

Lines changed: 11 additions & 1 deletion
@@ -41,7 +41,7 @@ import {
   getCurrentSessionTitle,
 } from "./features/claude-code-session-state";
 import { updateTerminalTitle } from "./features/terminal";
-import { builtinTools, createCallOmoAgent, createBackgroundTools } from "./tools";
+import { builtinTools, createCallOmoAgent, createBackgroundTools, createLookAt } from "./tools";
 import { BackgroundManager } from "./features/background-agent";
 import { createBuiltinMcps } from "./mcp";
 import { OhMyOpenCodeConfigSchema, type OhMyOpenCodeConfig, type HookName } from "./config";
@@ -218,6 +218,7 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => {
   const backgroundTools = createBackgroundTools(backgroundManager, ctx.client);

   const callOmoAgent = createCallOmoAgent(ctx, backgroundManager);
+  const lookAt = createLookAt(ctx);

   const googleAuthHooks = pluginConfig.google_auth
     ? await createGoogleAntigravityAuthPlugin(ctx)
@@ -230,6 +231,7 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => {
       ...builtinTools,
       ...backgroundTools,
       call_omo_agent: callOmoAgent,
+      look_at: lookAt,
     },

     "chat.message": async (input, output) => {
@@ -268,6 +270,14 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => {
         call_omo_agent: false,
       };
     }
+    if (config.agent["multimodal-looker"]) {
+      config.agent["multimodal-looker"].tools = {
+        ...config.agent["multimodal-looker"].tools,
+        task: false,
+        call_omo_agent: false,
+        look_at: false,
+      };
+    }

     const mcpResult = (pluginConfig.claude_code?.mcp ?? true)
       ? await loadMcpConfigs()

src/tools/index.ts

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ import type { BackgroundManager } from "../features/background-agent"
 type OpencodeClient = PluginInput["client"]

 export { createCallOmoAgent } from "./call-omo-agent"
+export { createLookAt } from "./look-at"

 export function createBackgroundTools(manager: BackgroundManager, client: OpencodeClient) {
   return {

src/tools/look-at/constants.ts

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+export const MULTIMODAL_LOOKER_AGENT = "multimodal-looker" as const
+
+export const LOOK_AT_DESCRIPTION = `Analyze media files (PDFs, images, diagrams) that require visual interpretation.
+
+Use this tool to extract specific information from files that cannot be processed as plain text:
+- PDF documents: extract text, tables, structure, specific sections
+- Images: describe layouts, UI elements, text content, diagrams
+- Charts/Graphs: explain data, trends, relationships
+- Screenshots: identify UI components, text, visual elements
+- Architecture diagrams: explain flows, connections, components
+
+Parameters:
+- file_path: Absolute path to the file to analyze
+- goal: What specific information to extract (be specific for better results)
+
+Examples:
+- "Extract all API endpoints from this OpenAPI spec PDF"
+- "Describe the UI layout and components in this screenshot"
+- "Explain the data flow in this architecture diagram"
+- "List all table data from page 3 of this PDF"
+
+This tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,
+saving tokens in the main conversation while providing accurate visual interpretation.`

src/tools/look-at/index.ts

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+export * from "./types"
+export * from "./constants"
+export { createLookAt } from "./tools"

src/tools/look-at/tools.ts

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+import { tool, type PluginInput } from "@opencode-ai/plugin"
+import { LOOK_AT_DESCRIPTION, MULTIMODAL_LOOKER_AGENT } from "./constants"
+import type { LookAtArgs } from "./types"
+import { log } from "../../shared/logger"
+
+export function createLookAt(ctx: PluginInput) {
+  return tool({
+    description: LOOK_AT_DESCRIPTION,
+    args: {
+      file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
+      goal: tool.schema.string().describe("What specific information to extract from the file"),
+    },
+    async execute(args: LookAtArgs, toolContext) {
+      log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`)
+
+      const prompt = `Analyze this file and extract the requested information.
+
+File path: ${args.file_path}
+Goal: ${args.goal}
+
+Read the file using the Read tool, then provide ONLY the extracted information that matches the goal.
+Be thorough on what was requested, concise on everything else.
+If the requested information is not found, clearly state what is missing.`
+
+      log(`[look_at] Creating session with parent: ${toolContext.sessionID}`)
+      const createResult = await ctx.client.session.create({
+        body: {
+          parentID: toolContext.sessionID,
+          title: `look_at: ${args.goal.substring(0, 50)}`,
+        },
+      })
+
+      if (createResult.error) {
+        log(`[look_at] Session create error:`, createResult.error)
+        return `Error: Failed to create session: ${createResult.error}`
+      }
+
+      const sessionID = createResult.data.id
+      log(`[look_at] Created session: ${sessionID}`)
+
+      log(`[look_at] Sending prompt to session ${sessionID}`)
+      await ctx.client.session.prompt({
+        path: { id: sessionID },
+        body: {
+          agent: MULTIMODAL_LOOKER_AGENT,
+          tools: {
+            task: false,
+            call_omo_agent: false,
+            look_at: false,
+          },
+          parts: [{ type: "text", text: prompt }],
+        },
+      })
+
+      log(`[look_at] Prompt sent, fetching messages...`)
+
+      const messagesResult = await ctx.client.session.messages({
+        path: { id: sessionID },
+      })
+
+      if (messagesResult.error) {
+        log(`[look_at] Messages error:`, messagesResult.error)
+        return `Error: Failed to get messages: ${messagesResult.error}`
+      }
+
+      const messages = messagesResult.data
+      log(`[look_at] Got ${messages.length} messages`)
+
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const lastAssistantMessage = messages
+        .filter((m: any) => m.info.role === "assistant")
+        .sort((a: any, b: any) => (b.info.time?.created || 0) - (a.info.time?.created || 0))[0]
+
+      if (!lastAssistantMessage) {
+        log(`[look_at] No assistant message found`)
+        return `Error: No response from multimodal-looker agent`
+      }
+
+      log(`[look_at] Found assistant message with ${lastAssistantMessage.parts.length} parts`)
+
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const textParts = lastAssistantMessage.parts.filter((p: any) => p.type === "text")
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const responseText = textParts.map((p: any) => p.text).join("\n")
+
+      log(`[look_at] Got response, length: ${responseText.length}`)
+
+      return responseText
+    },
+  })
+}
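
The execute handler returns the text of the most recent assistant message from the child session. Isolated from the tool plumbing, that selection step amounts to the helper sketched below; the Msg type and the latestAssistantText name are simplifications for illustration, not part of the SDK.

// Sketch only: Msg approximates the message shape the tool reads (info + parts).
type Msg = {
  info: { role: string; time?: { created?: number } }
  parts: { type: string; text?: string }[]
}

function latestAssistantText(messages: Msg[]): string | undefined {
  // Newest assistant message first, mirroring the filter/sort in tools.ts above.
  const last = messages
    .filter((m) => m.info.role === "assistant")
    .sort((a, b) => (b.info.time?.created ?? 0) - (a.info.time?.created ?? 0))[0]
  // Concatenate its text parts, which is what look_at ultimately returns.
  return last?.parts
    .filter((p) => p.type === "text")
    .map((p) => p.text ?? "")
    .join("\n")
}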

src/tools/look-at/types.ts

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+export interface LookAtArgs {
+  file_path: string
+  goal: string
+}

0 commit comments
