mithun50 · mithun50 · Mar 22, 2026 · Mar 20, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/src/core.ts b/src/core.ts
@@ -23,10 +23,53 @@ import {
   structureContinuePrompt,
   retrievalPrompt,
   answerPrompt,
+  imageDescriptionPrompt,
 } from "./prompts.js";
+import { countTokens } from "./pdf-parser.js";
 import type { Page, TreeNode, IndexData, Stats } from "./types.js";
 import type { BaseLLM } from "./llm-backends.js";
 
+/**
+ * Append a textual marker for every image on each page, modifying `pages` in place.
+ *
+ * Per image, in priority order:
+ * - non-blank `alt_text` yields `[Image: <alt>]`;
+ * - otherwise, when `llm` reports vision support and the image carries data,
+ *   the model's description yields `[Image: <description>]`;
+ * - otherwise (no vision backend, no image data, a failed call, or a blank
+ *   description) the generic marker `[Image present]` is used.
+ *
+ * Markers are appended to `page.text`, one per line, and `page.token_count`
+ * is recomputed from the new text. Pages without images are left untouched.
+ *
+ * @param pages - Pages to annotate; mutated in place.
+ * @param llm - Optional backend used to describe images that lack alt text.
+ * @param verbose - When true, logs per-page progress and vision failures.
+ */
+async function describeImages(
+  pages: Page[],
+  llm?: BaseLLM | null,
+  verbose: boolean = false,
+): Promise<void> {
+  const fallbackMarker = "[Image present]";
+
+  for (const page of pages) {
+    if (!page.images || page.images.length === 0) continue;
+
+    const descriptions: string[] = [];
+    for (const img of page.images) {
+      const alt = (img.alt_text ?? "").trim();
+      if (alt) {
+        descriptions.push(`[Image: ${alt}]`);
+        continue;
+      }
+      if (!llm?.supportsVision || !img.data) {
+        descriptions.push(fallbackMarker);
+        continue;
+      }
+
+      try {
+        const raw = await llm.generateWithImage(
+          imageDescriptionPrompt(),
+          img.data,
+          img.mime_type,
+        );
+        // A backend may hand back blank (or, despite the typing, null) content;
+        // use the generic marker rather than emitting an empty "[Image: ]".
+        const desc = (raw ?? "").trim();
+        descriptions.push(desc ? `[Image: ${desc}]` : fallbackMarker);
+      } catch (err: unknown) {
+        // Best effort: one failed vision call must not abort indexing, but
+        // the failure is surfaced when the caller asked for verbose output.
+        if (verbose) {
+          const reason = err instanceof Error ? err.message : String(err);
+          console.log(`  Page ${page.page_num}: image description failed (${reason})`);
+        }
+        descriptions.push(fallbackMarker);
+      }
+    }
+
+    // Every image contributes exactly one marker, so `descriptions` is
+    // non-empty whenever this point is reached.
+    page.text = page.text + "\n" + descriptions.join("\n");
+    page.token_count = countTokens(page.text);
+
+    if (verbose) {
+      console.log(`  Page ${page.page_num}: ${descriptions.length} image(s) described`);
+    }
+  }
+}
+
 /** Result of a TreeDex query. */
 export class QueryResult {
   readonly context: string;
@@ -100,13 +143,15 @@ export class TreeDex {
       maxTokens?: number;
       overlap?: number;
       verbose?: boolean;
+      extractImages?: boolean;
     },
   ): Promise<TreeDex> {
     const {
       loader,
       maxTokens = 20000,
       overlap = 1,
       verbose = true,
+      extractImages = false,
     } = options ?? {};
 
     if (verbose) {
@@ -118,7 +163,7 @@ export class TreeDex {
     if (loader) {
       pages = await loader.load(path);
     } else {
-      pages = await autoLoader(path);
+      pages = await autoLoader(path, { extractImages });
     }
 
     if (verbose) {
@@ -141,6 +186,9 @@ export class TreeDex {
   ): Promise<TreeDex> {
     const { maxTokens = 20000, overlap = 1, verbose = true } = options ?? {};
 
+    // Describe images before grouping — appends text markers to pages
+    await describeImages(pages, llm, verbose);
+
     const groups = groupPages(pages, maxTokens, overlap);
 
     if (verbose) {
@@ -288,11 +336,14 @@ export class TreeDex {
     const fs = await import("node:fs/promises");
     const stripped = stripTextFromTree(this.tree);
 
+    // Strip images from pages — descriptions are already in text
+    const cleanPages: Page[] = this.pages.map(({ images: _images, ...rest }) => rest);
+
     const data: IndexData = {
       version: "1.0",
       framework: "TreeDex",
       tree: stripped,
-      pages: this.pages,
+      pages: cleanPages,
     };
 
     await fs.writeFile(path, JSON.stringify(data, null, 2), "utf-8");

diff --git a/src/index.ts b/src/index.ts
@@ -56,5 +56,6 @@ export {
   structureContinuePrompt,
   retrievalPrompt,
   answerPrompt,
+  imageDescriptionPrompt,
 } from "./prompts.js";
-export type { Page, TreeNode, IndexData, Stats } from "./types.js";
+export type { Page, PageImage, TreeNode, IndexData, Stats } from "./types.js";
diff --git a/src/llm-backends.ts b/src/llm-backends.ts
@@ -33,6 +33,22 @@
 export abstract class BaseLLM {
   abstract generate(prompt: string): Promise<string>;
 
+  /**
+   * Whether this backend supports image inputs.
+   *
+   * The base implementation returns `false`; backends that implement
+   * `generateWithImage` override this getter to return `true`.
+   */
+  get supportsVision(): boolean {
+    return false;
+  }
+
+  /**
+   * Generate text from a prompt accompanied by a single image.
+   *
+   * The base implementation never produces a result: it rejects with an
+   * `Error` whose message names the concrete backend class. The arguments
+   * are unused here.
+   *
+   * @throws Error always, in this base implementation.
+   */
+  async generateWithImage(
+    _prompt: string,
+    _imageBase64: string,
+    _mimeType: string,
+  ): Promise<string> {
+    const backendName = this.constructor.name;
+    throw new Error(`${backendName} does not support vision/image inputs.`);
+  }
+
   toString(): string {
     return `${this.constructor.name}()`;
   }
@@ -65,11 +81,28 @@ export class GeminiLLM extends BaseLLM {
   }
 
   async generate(prompt: string): Promise<string> {
-    const model = await this.getClient() as { generateContent(p: string): Promise<{ response: { text(): string } }> };
+    const model = await this.getClient() as { generateContent(p: unknown): Promise<{ response: { text(): string } }> };
     const response = await model.generateContent(prompt);
     return response.response.text();
   }
 
+  /** Reports that this backend accepts image inputs; always `true`. */
+  get supportsVision(): boolean {
+    return true;
+  }
+
+  /**
+   * Send `prompt` together with one base64-encoded image to the model and
+   * return the text of its response.
+   *
+   * @param prompt - Instruction text sent as the first part of the request.
+   * @param imageBase64 - Image payload, passed through as `inlineData.data`.
+   * @param mimeType - Passed through as `inlineData.mimeType`.
+   */
+  async generateWithImage(
+    prompt: string,
+    imageBase64: string,
+    mimeType: string,
+  ): Promise<string> {
+    type GenerativeModel = {
+      generateContent(p: unknown): Promise<{ response: { text(): string } }>;
+    };
+    const gemini = (await this.getClient()) as GenerativeModel;
+
+    // Request parts: the prompt string first, then the inline image.
+    const parts = [prompt, { inlineData: { mimeType, data: imageBase64 } }];
+    const result = await gemini.generateContent(parts);
+    return result.response.text();
+  }
+
   toString(): string {
     return `GeminiLLM(model=${JSON.stringify(this.modelName)})`;
   }
@@ -113,6 +146,40 @@ export class OpenAILLM extends BaseLLM {
     return response.choices[0].message.content;
   }
 
+  /** Reports that this backend accepts image inputs; always `true`. */
+  get supportsVision(): boolean {
+    return true;
+  }
+
+  /**
+   * Send `prompt` together with one base64-encoded image as a single user
+   * message and return the text of the first choice.
+   *
+   * The image is embedded as a `data:` URL built from `mimeType` and
+   * `imageBase64`.
+   *
+   * @param prompt - Instruction text sent as the message's text part.
+   * @param imageBase64 - Base64 image payload (without a `data:` prefix).
+   * @param mimeType - MIME type used in the `data:` URL.
+   * @throws Error when the response carries no choice or its content is null,
+   *   so callers never receive `null` typed as `string`.
+   */
+  async generateWithImage(
+    prompt: string,
+    imageBase64: string,
+    mimeType: string,
+  ): Promise<string> {
+    const client = await this.getClient() as {
+      chat: {
+        completions: {
+          create(opts: unknown): Promise<{
+            // The API may return a null `content` (e.g. no text produced).
+            choices: Array<{ message: { content: string | null } }>;
+          }>;
+        };
+      };
+    };
+    const response = await client.chat.completions.create({
+      model: this.modelName,
+      messages: [{
+        role: "user",
+        content: [
+          { type: "text", text: prompt },
+          {
+            type: "image_url",
+            image_url: { url: `data:${mimeType};base64,${imageBase64}` },
+          },
+        ],
+      }],
+    });
+
+    const content = response.choices[0]?.message?.content;
+    if (content == null) {
+      throw new Error(
+        `${this.constructor.name} returned no text content for the image request.`,
+      );
+    }
+    return content;
+  }
+
   toString(): string {
     return `OpenAILLM(model=${JSON.stringify(this.modelName)})`;
   }
@@ -155,6 +222,43 @@ export class ClaudeLLM extends BaseLLM {
     return response.content[0].text;
   }
 
+  /** Reports that this backend accepts image inputs; always `true`. */
+  get supportsVision(): boolean {
+    return true;
+  }
+
+  /**
+   * Send one base64-encoded image followed by `prompt` as a single user
+   * message and return the text of the first content block in the reply.
+   *
+   * The request is capped at 4096 output tokens.
+   *
+   * @param prompt - Instruction text sent as the text block.
+   * @param imageBase64 - Passed through as the image source `data`.
+   * @param mimeType - Passed through as the image source `media_type`.
+   */
+  async generateWithImage(
+    prompt: string,
+    imageBase64: string,
+    mimeType: string,
+  ): Promise<string> {
+    type MessagesClient = {
+      messages: {
+        create(opts: unknown): Promise<{ content: Array<{ text: string }> }>;
+      };
+    };
+    const anthropic = (await this.getClient()) as MessagesClient;
+
+    // Content order matters to the request shape: image block first, then text.
+    const imageBlock = {
+      type: "image",
+      source: { type: "base64", media_type: mimeType, data: imageBase64 },
+    };
+    const textBlock = { type: "text", text: prompt };
+
+    const reply = await anthropic.messages.create({
+      model: this.modelName,
+      max_tokens: 4096,
+      messages: [{ role: "user", content: [imageBlock, textBlock] }],
+    });
+    return reply.content[0].text;
+  }
+
   toString(): string {
     return `ClaudeLLM(model=${JSON.stringify(this.modelName)})`;
   }

diff --git a/src/loaders.ts b/src/loaders.ts
@@ -27,9 +27,15 @@ export function textToPages(
 
 /** Load PDF files using pdfjs-dist. */
 export class PDFLoader {
+  readonly extractImages: boolean;
+
+  constructor(options?: { extractImages?: boolean }) {
+    this.extractImages = options?.extractImages ?? false;
+  }
+
   async load(path: string): Promise<Page[]> {
     const { extractPages } = await import("./pdf-parser.js");
-    return extractPages(path);
+    return extractPages(path, { extractImages: this.extractImages });
   }
 }
 
@@ -73,8 +79,16 @@ export class HTMLLoader {
         let skip = false;
 
         const parser = new Parser({
-          onopentag(name: string) {
+          onopentag(name: string, attribs: Record<string, string>) {
             if (name === "script" || name === "style") skip = true;
+            if (name === "img") {
+              const alt = (attribs.alt || "").trim();
+              if (alt) {
+                parts.push(`\n[Image: ${alt}]\n`);
+              } else {
+                parts.push("\n[Image]\n");
+              }
+            }
           },
           onclosetag(name: string) {
             if (name === "script" || name === "style") skip = false;
@@ -98,9 +112,16 @@ export class HTMLLoader {
       });
     } catch {
       // Fallback: simple regex-based tag stripping
-      return html
+      // Extract img alt text before stripping all tags
+      let processed = html
         .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
-        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
+        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
+      processed = processed.replace(/<img\b[^>]*>/gi, (tag) => {
+        const altMatch = tag.match(/alt=["']([^"']*)["']/i);
+        const alt = altMatch ? altMatch[1].trim() : "";
+        return alt ? ` [Image: ${alt}] ` : " [Image] ";
+      });
+      return processed
         .replace(/<[^>]+>/g, " ")
         .replace(/\s+/g, " ")
         .trim();
@@ -121,8 +142,16 @@ export class DOCXLoader {
     // @ts-expect-error -- optional peer dependency
     const mammoth = await import("mammoth");
     const buffer = await fs.readFile(path);
-    const result = await mammoth.extractRawText({ buffer });
-    return textToPages(result.value, this.charsPerPage);
+    const result = await mammoth.convertToHtml({ buffer });
+    // Replace <img> tags with [Image: alt] markers, then strip remaining HTML
+    let html: string = result.value;
+    html = html.replace(/<img\b[^>]*>/gi, (tag: string) => {
+      const altMatch = tag.match(/alt=["']([^"']*)["']/i);
+      const alt = altMatch ? altMatch[1].trim() : "";
+      return alt ? `[Image: ${alt}]` : "[Image]";
+    });
+    const text = html.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
+    return textToPages(text, this.charsPerPage);
   }
 }
 
@@ -140,7 +169,10 @@ const EXTENSION_MAP: Record<string, { new (): Loader }> = {
 };
 
 /** Auto-detect file format and load pages. */
-export async function autoLoader(filePath: string): Promise<Page[]> {
+export async function autoLoader(
+  filePath: string,
+  options?: { extractImages?: boolean },
+): Promise<Page[]> {
   const { extname } = await import("node:path");
   const ext = extname(filePath).toLowerCase();
   const LoaderClass = EXTENSION_MAP[ext];
@@ -150,6 +182,9 @@ export async function autoLoader(filePath: string): Promise<Page[]> {
       `Unsupported file extension '${ext}'. Supported: ${supported}`,
     );
   }
+  if (ext === ".pdf" && options?.extractImages) {
+    return new PDFLoader({ extractImages: true }).load(filePath);
+  }
   const loader = new LoaderClass();
   return loader.load(filePath);
 }