Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 53 additions & 2 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,53 @@ import {
structureContinuePrompt,
retrievalPrompt,
answerPrompt,
imageDescriptionPrompt,
} from "./prompts.js";
import { countTokens } from "./pdf-parser.js";
import type { Page, TreeNode, IndexData, Stats } from "./types.js";
import type { BaseLLM } from "./llm-backends.js";

/**
 * Append a text marker for every image on each page, modifying `pages` in place.
 *
 * For each image the marker is chosen in this order:
 *  1. `[Image: <alt_text>]` when the image carries non-blank alt text;
 *  2. `[Image: <description>]` from `llm.generateWithImage` when the backend
 *     reports `supportsVision` and the image has `data`;
 *  3. `[Image present]` otherwise, including when the vision call throws or
 *     comes back blank.
 *
 * The markers are appended to `page.text` (one per line) and
 * `page.token_count` is recomputed with `countTokens`. Pages without images
 * are left untouched.
 *
 * @param pages - Pages to annotate; mutated in place.
 * @param llm - Optional backend used to describe images that lack alt text.
 * @param verbose - When true, logs a per-page summary and any vision failure.
 */
async function describeImages(
  pages: Page[],
  llm?: BaseLLM | null,
  verbose: boolean = false,
): Promise<void> {
  const placeholder = "[Image present]";

  for (const page of pages) {
    if (!page.images || page.images.length === 0) continue;

    const descriptions: string[] = [];
    for (const img of page.images) {
      const alt = (img.alt_text ?? "").trim();
      if (alt) {
        descriptions.push(`[Image: ${alt}]`);
        continue;
      }

      if (!llm?.supportsVision || !img.data) {
        descriptions.push(placeholder);
        continue;
      }

      try {
        const raw = await llm.generateWithImage(
          imageDescriptionPrompt(),
          img.data,
          img.mime_type,
        );
        // A backend may hand back null or whitespace; never emit an empty "[Image: ]".
        const desc = (raw ?? "").trim();
        descriptions.push(desc ? `[Image: ${desc}]` : placeholder);
      } catch (err: unknown) {
        // Best effort: one failed description must not abort indexing.
        if (verbose) {
          const reason = err instanceof Error ? err.message : String(err);
          console.warn(` Page ${page.page_num}: image description failed (${reason})`);
        }
        descriptions.push(placeholder);
      }
    }

    // Exactly one marker was pushed per image, so `descriptions` is non-empty here.
    page.text = page.text + "\n" + descriptions.join("\n");
    page.token_count = countTokens(page.text);

    if (verbose) {
      console.log(` Page ${page.page_num}: ${descriptions.length} image(s) described`);
    }
  }
}

/** Result of a TreeDex query. */
export class QueryResult {
readonly context: string;
Expand Down Expand Up @@ -100,13 +143,15 @@ export class TreeDex {
maxTokens?: number;
overlap?: number;
verbose?: boolean;
extractImages?: boolean;
},
): Promise<TreeDex> {
const {
loader,
maxTokens = 20000,
overlap = 1,
verbose = true,
extractImages = false,
} = options ?? {};

if (verbose) {
Expand All @@ -118,7 +163,7 @@ export class TreeDex {
if (loader) {
pages = await loader.load(path);
} else {
pages = await autoLoader(path);
pages = await autoLoader(path, { extractImages });
}

if (verbose) {
Expand All @@ -141,6 +186,9 @@ export class TreeDex {
): Promise<TreeDex> {
const { maxTokens = 20000, overlap = 1, verbose = true } = options ?? {};

// Describe images before grouping — appends text markers to pages
await describeImages(pages, llm, verbose);

const groups = groupPages(pages, maxTokens, overlap);

if (verbose) {
Expand Down Expand Up @@ -288,11 +336,14 @@ export class TreeDex {
const fs = await import("node:fs/promises");
const stripped = stripTextFromTree(this.tree);

// Strip images from pages — descriptions are already in text
const cleanPages: Page[] = this.pages.map(({ images: _images, ...rest }) => rest);

const data: IndexData = {
version: "1.0",
framework: "TreeDex",
tree: stripped,
pages: this.pages,
pages: cleanPages,
};

await fs.writeFile(path, JSON.stringify(data, null, 2), "utf-8");
Expand Down
3 changes: 2 additions & 1 deletion src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,6 @@ export {
structureContinuePrompt,
retrievalPrompt,
answerPrompt,
imageDescriptionPrompt,
} from "./prompts.js";
export type { Page, TreeNode, IndexData, Stats } from "./types.js";
export type { Page, PageImage, TreeNode, IndexData, Stats } from "./types.js";
106 changes: 105 additions & 1 deletion src/llm-backends.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,22 @@
export abstract class BaseLLM {
abstract generate(prompt: string): Promise<string>;

/**
 * Whether this backend supports image inputs.
 *
 * Base default: always `false`. A backend that can accept images overrides
 * this getter to return `true`.
 */
get supportsVision(): boolean {
return false;
}

/**
 * Send a prompt together with an image and return the generated text.
 *
 * The base implementation never succeeds: it rejects with an `Error` naming
 * the concrete class, so a backend without vision support fails loudly.
 *
 * @param _prompt - Text prompt (unused here).
 * @param _imageBase64 - Base64-encoded image payload (unused here).
 * @param _mimeType - MIME type of the image (unused here).
 * @throws Error always, stating that the backend has no vision support.
 */
async generateWithImage(
  _prompt: string,
  _imageBase64: string,
  _mimeType: string,
): Promise<string> {
  const backendName = this.constructor.name;
  throw new Error(`${backendName} does not support vision/image inputs.`);
}

/** Render as `<ClassName>()`, using the runtime constructor name of the instance. */
toString(): string {
return `${this.constructor.name}()`;
}
Expand Down Expand Up @@ -65,11 +81,28 @@ export class GeminiLLM extends BaseLLM {
}

async generate(prompt: string): Promise<string> {
const model = await this.getClient() as { generateContent(p: string): Promise<{ response: { text(): string } }> };
const model = await this.getClient() as { generateContent(p: unknown): Promise<{ response: { text(): string } }> };
const response = await model.generateContent(prompt);
return response.response.text();
}

/**
 * Reports that this backend accepts image inputs.
 *
 * Always `true`; the value is not derived from the configured model name.
 * NOTE(review): a text-only model would still be reported as vision-capable —
 * confirm whether this should be configurable per instance.
 */
get supportsVision(): boolean {
return true;
}
Comment on lines +89 to +91
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, let's examine the src/llm-backends.ts file to understand the structure and find the relevant lines
wc -l src/llm-backends.ts

Repository: mithun50/TreeDex

Length of output: 83


🏁 Script executed:

# Read the entire file to understand the implementation
cat -n src/llm-backends.ts

Repository: mithun50/TreeDex

Length of output: 25884


🏁 Script executed:

# Also check src/core.ts to understand how supportsVision is used
cat -n src/core.ts

Repository: mithun50/TreeDex

Length of output: 13819


supportsVision should be model-aware, not hard-coded by backend class.

The `supportsVision` getters on `GeminiLLM`, `OpenAILLM`, and `ClaudeLLM` always return `true`, even when callers override `modelName` with a text-only model. Although `generateWithImage()` errors are caught with a graceful fallback in `src/core.ts` (line 55), this still causes unnecessary API calls and errors for text-only model configurations. Please make this configurable per instance, or derive it from the chosen model, to avoid wasted requests.

Also applies to: 149-151, 225-227

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/llm-backends.ts` around lines 89 - 91, The supportsVision getter is
hard-coded to return true causing wasted image requests; change it to be
model-aware by making supportsVision read from instance state (e.g.,
this.modelName or this.opts.supportsVision) or derive it from a helper
isVisionModel(modelName). Update the getters at the three locations (the get
supportsVision() implementations around lines 89-91, 149-151, 225-227) to return
a boolean based on either an explicit constructor option (supportsVision) or on
a small isVisionModel(this.modelName) lookup, and add that option/lookup where
the backend classes are constructed so callers can override or the backend can
compute capability from the model name.


/**
 * Send a prompt plus one inline image to the model and return its text reply.
 *
 * The request is a two-element content array: the prompt string followed by
 * an `inlineData` part carrying the MIME type and base64 payload.
 *
 * @param prompt - Text instruction sent with the image.
 * @param imageBase64 - Base64-encoded image bytes.
 * @param mimeType - MIME type of the image.
 * @returns The text of the model response.
 */
async generateWithImage(
  prompt: string,
  imageBase64: string,
  mimeType: string,
): Promise<string> {
  type ContentModel = {
    generateContent(p: unknown): Promise<{ response: { text(): string } }>;
  };

  const model = (await this.getClient()) as ContentModel;
  const result = await model.generateContent([
    prompt,
    { inlineData: { mimeType, data: imageBase64 } },
  ]);
  return result.response.text();
}

/** Render as `GeminiLLM(model=...)`, with the model name JSON-quoted. */
toString(): string {
return `GeminiLLM(model=${JSON.stringify(this.modelName)})`;
}
Expand Down Expand Up @@ -113,6 +146,40 @@ export class OpenAILLM extends BaseLLM {
return response.choices[0].message.content;
}

/**
 * Reports that this backend accepts image inputs.
 *
 * Always `true`; the value is not derived from the configured model name.
 * NOTE(review): a text-only model would still be reported as vision-capable —
 * confirm whether this should be configurable per instance.
 */
get supportsVision(): boolean {
return true;
}

/**
 * Send a prompt plus one image to the chat-completions endpoint and return
 * the text of the first choice.
 *
 * The image travels as a `data:` URL inside an `image_url` content part,
 * after a `text` part holding the prompt.
 *
 * @param prompt - Text instruction sent with the image.
 * @param imageBase64 - Base64-encoded image bytes.
 * @param mimeType - MIME type used in the `data:` URL.
 * @returns The text content of the first choice.
 * @throws Error when the response has no choice or the choice has no text
 *   content (the API may return `content: null`, e.g. on a refusal).
 */
async generateWithImage(
  prompt: string,
  imageBase64: string,
  mimeType: string,
): Promise<string> {
  const client = (await this.getClient()) as {
    chat: {
      completions: {
        create(opts: unknown): Promise<{
          choices: Array<{ message: { content: string | null } }>;
        }>;
      };
    };
  };
  const response = await client.chat.completions.create({
    model: this.modelName,
    messages: [{
      role: "user",
      content: [
        { type: "text", text: prompt },
        {
          type: "image_url",
          image_url: { url: `data:${mimeType};base64,${imageBase64}` },
        },
      ],
    }],
  });

  // Fail with a clear message instead of returning null from a Promise<string>
  // or crashing with an opaque TypeError on an empty `choices` array.
  const content = response.choices[0]?.message?.content;
  if (typeof content !== "string") {
    throw new Error(
      `${this.constructor.name} returned no text content for the image request.`,
    );
  }
  return content;
}

/** Render as `OpenAILLM(model=...)`, with the model name JSON-quoted. */
toString(): string {
return `OpenAILLM(model=${JSON.stringify(this.modelName)})`;
}
Expand Down Expand Up @@ -155,6 +222,43 @@ export class ClaudeLLM extends BaseLLM {
return response.content[0].text;
}

/**
 * Reports that this backend accepts image inputs.
 *
 * Always `true`; the value is not derived from the configured model name.
 * NOTE(review): a text-only model would still be reported as vision-capable —
 * confirm whether this should be configurable per instance.
 */
get supportsVision(): boolean {
return true;
}

/**
 * Send one image followed by a text prompt in a single user message and
 * return the text of the first content block of the reply.
 *
 * The image is sent as a base64 `image` block ahead of the `text` block, and
 * the request caps the reply at 4096 tokens.
 *
 * @param prompt - Text instruction sent after the image.
 * @param imageBase64 - Base64-encoded image bytes.
 * @param mimeType - Media type declared for the image.
 * @returns The `text` of the first content block in the response.
 */
async generateWithImage(
  prompt: string,
  imageBase64: string,
  mimeType: string,
): Promise<string> {
  type MessagesClient = {
    messages: {
      create(opts: unknown): Promise<{
        content: Array<{ text: string }>;
      }>;
    };
  };

  const imageBlock = {
    type: "image",
    source: {
      type: "base64",
      media_type: mimeType,
      data: imageBase64,
    },
  };
  const textBlock = { type: "text", text: prompt };

  const client = (await this.getClient()) as MessagesClient;
  const reply = await client.messages.create({
    model: this.modelName,
    max_tokens: 4096,
    messages: [{ role: "user", content: [imageBlock, textBlock] }],
  });
  return reply.content[0].text;
}

/** Render as `ClaudeLLM(model=...)`, with the model name JSON-quoted. */
toString(): string {
return `ClaudeLLM(model=${JSON.stringify(this.modelName)})`;
}
Expand Down
49 changes: 42 additions & 7 deletions src/loaders.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,15 @@ export function textToPages(

/**
 * Load PDF files using pdfjs-dist.
 *
 * Parsing is delegated to `extractPages` from `./pdf-parser.js`, which is
 * imported lazily on the first `load` call.
 */
export class PDFLoader {
  /** Whether `load` asks the parser to extract images; defaults to `false`. */
  readonly extractImages: boolean;

  /**
   * @param options - Optional settings; `extractImages` defaults to `false`.
   */
  constructor(options?: { extractImages?: boolean }) {
    this.extractImages = options?.extractImages ?? false;
  }

  /**
   * Parse the PDF at `path` into pages.
   *
   * @param path - Path of the PDF file to read.
   * @returns The pages produced by `extractPages`.
   */
  async load(path: string): Promise<Page[]> {
    const { extractPages } = await import("./pdf-parser.js");
    // Forward the option: an earlier bare `extractPages(path)` return made it dead code.
    return extractPages(path, { extractImages: this.extractImages });
  }
}

Expand Down Expand Up @@ -73,8 +79,16 @@ export class HTMLLoader {
let skip = false;

const parser = new Parser({
onopentag(name: string) {
onopentag(name: string, attribs: Record<string, string>) {
if (name === "script" || name === "style") skip = true;
if (name === "img") {
const alt = (attribs.alt || "").trim();
if (alt) {
parts.push(`\n[Image: ${alt}]\n`);
} else {
parts.push("\n[Image]\n");
}
}
},
onclosetag(name: string) {
if (name === "script" || name === "style") skip = false;
Expand All @@ -98,9 +112,16 @@ export class HTMLLoader {
});
} catch {
// Fallback: simple regex-based tag stripping
return html
// Extract img alt text before stripping all tags
let processed = html
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
processed = processed.replace(/<img\b[^>]*>/gi, (tag) => {
const altMatch = tag.match(/alt=["']([^"']*)["']/i);
const alt = altMatch ? altMatch[1].trim() : "";
return alt ? ` [Image: ${alt}] ` : " [Image] ";
});
return processed
.replace(/<[^>]+>/g, " ")
.replace(/\s+/g, " ")
.trim();
Expand All @@ -121,8 +142,16 @@ export class DOCXLoader {
// @ts-expect-error -- optional peer dependency
const mammoth = await import("mammoth");
const buffer = await fs.readFile(path);
const result = await mammoth.extractRawText({ buffer });
return textToPages(result.value, this.charsPerPage);
const result = await mammoth.convertToHtml({ buffer });
// Replace <img> tags with [Image: alt] markers, then strip remaining HTML
let html: string = result.value;
html = html.replace(/<img\b[^>]*>/gi, (tag: string) => {
const altMatch = tag.match(/alt=["']([^"']*)["']/i);
const alt = altMatch ? altMatch[1].trim() : "";
return alt ? `[Image: ${alt}]` : "[Image]";
});
const text = html.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
return textToPages(text, this.charsPerPage);
}
}

Expand All @@ -140,7 +169,10 @@ const EXTENSION_MAP: Record<string, { new (): Loader }> = {
};

/** Auto-detect file format and load pages. */
export async function autoLoader(filePath: string): Promise<Page[]> {
export async function autoLoader(
filePath: string,
options?: { extractImages?: boolean },
): Promise<Page[]> {
const { extname } = await import("node:path");
const ext = extname(filePath).toLowerCase();
const LoaderClass = EXTENSION_MAP[ext];
Expand All @@ -150,6 +182,9 @@ export async function autoLoader(filePath: string): Promise<Page[]> {
`Unsupported file extension '${ext}'. Supported: ${supported}`,
);
}
if (ext === ".pdf" && options?.extractImages) {
return new PDFLoader({ extractImages: true }).load(filePath);
}
const loader = new LoaderClass();
return loader.load(filePath);
}
Loading
Loading