- {task.webResults.map((r, i) => (
-
-
[{i + 1}]
-
-
{r.title}
-
{r.source}
-
+
+
+ Sources ({task.webResults.length}
+ {hasCrawlResults ? ` · ${successfulCount} ${t("research.crawlDone", { count: successfulCount }).split(" ").slice(-1)}` : ""})
+
+ {hasCrawlResults && (
+
+
+ ·
+
- ))}
+ )}
+
+ {task.webResults.map((r, i) => {
+ const crawled = crawledByUrl.get(r.url)
+ const isSelected = task.selectedUrls.has(r.url)
+ const canSelect = crawled?.status === "success"
+
+ return (
+
+ )
+ })}
+
+
+ {/* Import button */}
+ {hasCrawlResults && selectedCount > 0 && (
+
+ )}
)}
diff --git a/src/i18n/en.json b/src/i18n/en.json
index 3d13c3cc..3aeb4689 100644
--- a/src/i18n/en.json
+++ b/src/i18n/en.json
@@ -280,5 +280,16 @@
"minutesAgo": "{{count}} min ago",
"hoursAgo": "{{count}} h ago",
"daysAgo": "{{count}} d ago"
+ },
+ "research": {
+ "crawlProgress": "Crawling pages... {{done}}/{{total}}",
+ "crawlDone": "{{count}} pages crawled",
+ "crawlFailed": "Crawl failed",
+ "selectAll": "Select All",
+ "deselectAll": "Deselect All",
+ "importSelected": "Import Selected ({{count}})",
+ "importingSources": "Importing {{count}} sources...",
+ "crawling": "Crawling",
+ "notCrawledYet": "Waiting to crawl"
}
}
diff --git a/src/i18n/zh.json b/src/i18n/zh.json
index fd5b06f4..da14da64 100644
--- a/src/i18n/zh.json
+++ b/src/i18n/zh.json
@@ -280,5 +280,16 @@
"minutesAgo": "{{count}} 分钟前",
"hoursAgo": "{{count}} 小时前",
"daysAgo": "{{count}} 天前"
+ },
+ "research": {
+ "crawlProgress": "正在爬取页面... {{done}}/{{total}}",
+ "crawlDone": "已爬取 {{count}} 个页面",
+ "crawlFailed": "爬取失败",
+ "selectAll": "全选",
+ "deselectAll": "取消全选",
+ "importSelected": "导入选中 ({{count}})",
+ "importingSources": "正在导入 {{count}} 个源文件...",
+ "crawling": "爬取中",
+ "notCrawledYet": "等待爬取"
}
}
diff --git a/src/lib/deep-research.ts b/src/lib/deep-research.ts
index d850ea50..cbd545c2 100644
--- a/src/lib/deep-research.ts
+++ b/src/lib/deep-research.ts
@@ -1,11 +1,14 @@
import { webSearch } from "./web-search"
import { streamChat } from "./llm-client"
import { autoIngest } from "./ingest"
-import { writeFile, readFile, listDirectory } from "@/commands/fs"
+import { writeFile, readFile, listDirectory, createDirectory } from "@/commands/fs"
import { useWikiStore, type LlmConfig, type SearchApiConfig } from "@/stores/wiki-store"
import { useResearchStore } from "@/stores/research-store"
import { normalizePath } from "@/lib/path-utils"
import { buildLanguageDirective } from "@/lib/output-language"
+import { crawlUrls } from "@/lib/web-crawler"
+import { getHttpFetch } from "@/lib/tauri-fetch"
+import { enqueueSourceIngest } from "@/lib/source-lifecycle"
/**
* Queue a deep research task. Automatically starts processing if under concurrency limit.
@@ -96,8 +99,23 @@ async function executeResearch(
return
}
- // Step 2: LLM synthesis
- store.updateTask(taskId, { status: "synthesizing" })
+ // Step 1.5: Crawl all result URLs (runs in parallel with LLM synthesis)
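+ // crawlUrls maps every URL to a "success" or "failed" page instead of throwing, so this
+ // background promise should not be able to reject and break the synthesis step below.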
+ const httpFetch = await getHttpFetch()
+ const crawlPromise = crawlUrls(
+ webResults.map((r) => r.url),
+ httpFetch,
+ {
+ concurrency: 4,
+ onProgress: (done, total) => {
+ useResearchStore.getState().updateCrawlProgress(taskId, done, total)
+ },
+ },
+ ).then((pages) => {
+ useResearchStore.getState().setCrawledPages(taskId, pages)
+ })
+
+ // Step 2: LLM synthesis (runs in parallel with crawl)
+ store.updateTask(taskId, { status: "synthesizing", crawlProgress: { done: 0, total: webResults.length } })
const searchContext = webResults
.map((r, i) => `[${i + 1}] **${r.title}** (${r.source})\n${r.snippet}`)
@@ -156,6 +174,9 @@ async function executeResearch(
},
)
+ // Wait for crawl to finish before saving
+ await crawlPromise
+
// Check if errored during streaming
if (useResearchStore.getState().tasks.find((t) => t.id === taskId)?.status === "error") {
onTaskFinished(pp, llmConfig, searchConfig)
@@ -174,7 +195,7 @@ async function executeResearch(
.map((r, i) => `${i + 1}. [${r.title}](${r.url}) — ${r.source}`)
.join("\n")
- // Strip <think>/</think> blocks before saving
+ // Strip <think> blocks before saving
const cleanedSynthesis = accumulated
.replace(/<think(?:ing)?>\s*[\s\S]*?<\/think(?:ing)?>\s*/gi, "")
.replace(/<think(?:ing)?>\s*[\s\S]*$/gi, "") // unclosed thinking block
@@ -231,6 +252,85 @@ async function executeResearch(
onTaskFinished(pp, llmConfig, searchConfig)
}
+/**
+ * Import user-selected crawled pages as source files for ingest.
+ */
+export async function importSelectedSources(
+ projectPath: string,
+ taskId: string,
+ llmConfig: LlmConfig,
+): Promise<string[]> {
+ const task = useResearchStore.getState().tasks.find((t) => t.id === taskId)
+ if (!task) return []
+
+ const project = useWikiStore.getState().project
+ if (!project) return []
+
+ const pp = normalizePath(projectPath)
+ const selected = task.selectedUrls
+ if (selected.size === 0) return []
+
+ const pagesToImport = task.crawledPages.filter(
+ (p) => p.status === "success" && selected.has(p.url),
+ )
+ if (pagesToImport.length === 0) return []
+
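+ // Slugify the topic for the directory name, e.g. "GraphQL Federation?" -> "graphql-federation"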
+ const topicSlug = task.topic
+ .toLowerCase()
+ .replace(/[^a-z0-9\s-]/g, "")
+ .trim()
+ .replace(/\s+/g, "-")
+ .slice(0, 50)
+ const sourcesDir = `${pp}/raw/sources/deep-research-${topicSlug}`
+
+ await createDirectory(sourcesDir)
+
+ const importedPaths: string[] = []
+
+ for (const page of pagesToImport) {
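+ // Derive a filename slug from the URL, e.g. "https://example.com/docs/intro?x=1" -> "example.com-docs-intro-x-1"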
+ const urlSlug = page.url
+ .replace(/^https?:\/\//, "")
+ .replace(/[/?#:]/g, "-")
+ .replace(/-{2,}/g, "-")
+ .replace(/^-|-$/g, "")
+ .slice(0, 80)
+ .toLowerCase()
+
+ const html = `<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<meta name="title" content="${escapeAttr(page.title)}">
+<meta name="source-url" content="${escapeAttr(page.url)}">
+</head>
+<body>
+${page.content}
+</body></html>`
+
+ const filePath = `${sourcesDir}/${urlSlug}.html`
+ await writeFile(filePath, html)
+ importedPaths.push(filePath)
+ }
+
+ if (importedPaths.length > 0) {
+ await enqueueSourceIngest(
+ project,
+ importedPaths,
+ llmConfig,
+ { sourceRoot: sourcesDir, rootContext: `deep-research-${topicSlug}` },
+ )
+ }
+
+ // Clear selection after import
+ useResearchStore.getState().clearSelection(taskId)
+
+ return importedPaths
+}
+
+function escapeAttr(s: string): string {
+ return s.replace(/&/g, "&amp;").replace(/"/g, "&quot;").replace(/</g, "&lt;").replace(/>/g, "&gt;")
+}
+
function onTaskFinished(
projectPath: string,
llmConfig: LlmConfig,
diff --git a/src/lib/file-types.ts b/src/lib/file-types.ts
index f1abf555..25495337 100644
--- a/src/lib/file-types.ts
+++ b/src/lib/file-types.ts
@@ -2,6 +2,7 @@ export type FileCategory =
| "markdown"
| "text"
| "code"
+ | "html"
| "image"
| "video"
| "audio"
@@ -47,8 +48,8 @@ const EXT_MAP: Record<string, FileCategory> = {
css: "code",
scss: "code",
less: "code",
- html: "code",
- htm: "code",
+ html: "html",
+ htm: "html",
xml: "code",
svg: "code",
vue: "code",
@@ -127,7 +128,7 @@ export function getFileCategory(filePath: string): FileCategory {
}
export function isTextReadable(category: FileCategory): boolean {
- return ["markdown", "text", "code", "data"].includes(category)
+ return ["markdown", "text", "code", "data", "html"].includes(category)
}
export function isBinary(category: FileCategory): boolean {
diff --git a/src/lib/web-crawler.test.ts b/src/lib/web-crawler.test.ts
new file mode 100644
index 00000000..05f5567a
--- /dev/null
+++ b/src/lib/web-crawler.test.ts
@@ -0,0 +1,78 @@
+import { describe, it, expect } from "vitest"
+import { extractContentFromHtml } from "./web-crawler"
+
+const FULL_HTML = `
+<html>
+<head>
+<meta property="og:title" content="Test Article">
+</head>
+<body>
+<nav>Nav stuff</nav>
+<article>
+ <h1>Test Article</h1>
+ <p>This is the main content with bold text.</p>
+ <p>Second paragraph.</p>
+ <script>console.log("analytics")</script>
+ <style>.ad{color:red}</style>
+</article>
+<footer>Footer links</footer>
+</body>
+</html>
+`
+
+const MAIN_ONLY = `
+<title>Main Page</title>
+<body>
+<nav>Navigation</nav>
+<main> <p>Main content here.</p></main>
+</body>
+`
+
+const BODY_ONLY = `
+<title>Body Page</title>
+<body> <p>Just body content.</p></body>
+`
+
+const NO_STRUCTURE = `Plain text without any HTML structure.`
+
+const TITLE_ENTITIES = `
+<title>Tom &amp; Jerry &lt;Cartoon&gt;</title>
+<body>Content</body>
+`
+
+describe("extractContentFromHtml", () => {
+ it("extracts article content and og:title", () => {
+ const result = extractContentFromHtml(FULL_HTML)
+ expect(result.title).toBe("Test Article")
+ expect(result.content).toContain("This is the main content with bold text.")
+ expect(result.content).toContain("Second paragraph.")
+ expect(result.content).not.toContain("Nav stuff")
+ expect(result.content).not.toContain("Footer links")
+ })
+
+ it("falls back to when no ", () => {
+ const result = extractContentFromHtml(MAIN_ONLY)
+ expect(result.title).toBe("Main Page")
+ expect(result.content).toContain("Main content here.")
+ expect(result.content).not.toContain("Navigation")
+ })
+
+ it("falls back to when no or ", () => {
+ const result = extractContentFromHtml(BODY_ONLY)
+ expect(result.title).toBe("Body Page")
+ expect(result.content).toContain("Just body content.")
+ })
+
+ it("handles plain text with no HTML structure", () => {
+ const result = extractContentFromHtml(NO_STRUCTURE)
+ expect(result.content).toContain("Plain text")
+ })
+
+ it("unescapes HTML entities in title", () => {
+ const result = extractContentFromHtml(TITLE_ENTITIES)
+ expect(result.title).toBe("Tom & Jerry ")
+ })
+
+ it("removes script and style tags", () => {
+ const result = extractContentFromHtml(FULL_HTML)
+ expect(result.content).not.toContain("console.log")
+ expect(result.content).not.toContain("color:red")
+ })
+})
diff --git a/src/lib/web-crawler.ts b/src/lib/web-crawler.ts
new file mode 100644
index 00000000..bc050dd3
--- /dev/null
+++ b/src/lib/web-crawler.ts
@@ -0,0 +1,128 @@
+export interface CrawledPage {
+ url: string
+ title: string
+ content: string
+ status: "success" | "failed"
+ error?: string
+}
+
+const DEFAULT_CONCURRENCY = 4
+const DEFAULT_TIMEOUT_MS = 15_000
+
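+// Remove each listed element along with its contents (e.g. whole <script>...</script> blocks)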
+function stripTags(html: string, tags: string[]): string {
+ return tags.reduce((s, tag) => {
+ const re = new RegExp(`<${tag}[^>]*>[\\s\\S]*?<\\/${tag}>`, "gi")
+ return s.replace(re, "")
+ }, html)
+}
+
+function extractBody(html: string): string {
+ // Prefer <article>, then <main>, then <body>
+ const article = /<article[^>]*>([\s\S]*?)<\/article>/i.exec(html)
+ if (article) return article[1]
+
+ const main = /<main[^>]*>([\s\S]*?)<\/main>/i.exec(html)
+ if (main) return main[1]
+
+ const body = /<body[^>]*>([\s\S]*?)<\/body>/i.exec(html)
+ if (body) return body[1]
+
+ return html
+}
+
+function extractTitle(html: string): string {
+ const og = /<meta[^>]*property="og:title"[^>]*content="([^"]*)"/i.exec(html)
+ if (og) return unescapeHtml(og[1])
+
+ const title = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html)
+ if (title) return unescapeHtml(title[1].trim())
+
+ const h1 = /<h1[^>]*>([\s\S]*?)<\/h1>/i.exec(html)
+ if (h1) return unescapeHtml(h1[1].replace(/<[^>]*>/g, "").trim())
+
+ return ""
+}
+
+function unescapeHtml(s: string): string {
+ return s
+ .replace(/&/g, "&")
+ .replace(/</g, "<")
+ .replace(/>/g, ">")
+ .replace(/"/g, '"')
+ .replace(/'/g, "'")
+ .replace(/'/g, "'")
+}
+
+const NOISE_TAGS = ["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"]
+
+export function extractContentFromHtml(html: string): { title: string; content: string } {
+ const title = extractTitle(html)
+ let body = extractBody(html)
+ body = stripTags(body, NOISE_TAGS)
+ // Collapse excessive whitespace
+ body = body.replace(/\n{3,}/g, "\n\n").trim()
+ return { title, content: body }
+}
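+// Example (hypothetical input): extractContentFromHtml("<title>Hi</title><body><article><p>Text</p></article></body>")
+// returns { title: "Hi", content: "<p>Text</p>" }; inline markup inside the kept region is preserved as-is.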
+
+async function crawlSingle(
+ url: string,
+ httpFetch: (url: string, init?: RequestInit) => Promise<Response>,
+ timeoutMs: number,
+): Promise<CrawledPage> {
+ try {
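+ // Abort the request if it runs past timeoutMs; the timer is cleared once a response arrives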
+ const controller = new AbortController()
+ const timer = setTimeout(() => controller.abort(), timeoutMs)
+ const res = await httpFetch(url, {
+ method: "GET",
+ headers: {
+ Accept: "text/html,application/xhtml+xml,*/*",
+ "User-Agent": "Mozilla/5.0 (compatible; LLMWiki/1.0)",
+ },
+ signal: controller.signal,
+ })
+ clearTimeout(timer)
+
+ if (!res.ok) {
+ return { url, title: "", content: "", status: "failed", error: `HTTP ${res.status}` }
+ }
+
+ const ct = res.headers.get("content-type") || ""
+ if (!ct.includes("text/html") && !ct.includes("application/xhtml")) {
+ return { url, title: "", content: "", status: "failed", error: `Not HTML: ${ct}` }
+ }
+
+ const html = await res.text()
+ const { title, content } = extractContentFromHtml(html)
+ return { url, title, content, status: "success" }
+ } catch (err) {
+ const msg = err instanceof Error ? err.message : String(err)
+ return { url, title: "", content: "", status: "failed", error: msg }
+ }
+}
+
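+/**
+ * Crawl URLs in fixed-size batches, preserving input order in the returned array.
+ * Usage (sketch): crawlUrls(urls, httpFetch, { concurrency: 4, onProgress: (done, total) => ... })
+ */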
+export async function crawlUrls(
+ urls: string[],
+ httpFetch: (url: string, init?: RequestInit) => Promise<Response>,
+ options?: { concurrency?: number; timeoutMs?: number; onProgress?: (done: number, total: number) => void },
+): Promise<CrawledPage[]> {
+ const concurrency = options?.concurrency ?? DEFAULT_CONCURRENCY
+ const timeoutMs = options?.timeoutMs ?? DEFAULT_TIMEOUT_MS
+ const onProgress = options?.onProgress
+ const results: CrawledPage[] = new Array(urls.length)
+ let done = 0
+
+ // Process in batches of `concurrency`
+ for (let i = 0; i < urls.length; i += concurrency) {
+ const batch = urls.slice(i, i + concurrency)
+ const batchResults = await Promise.all(
+ batch.map((url) => crawlSingle(url, httpFetch, timeoutMs)),
+ )
+ batchResults.forEach((r, j) => {
+ results[i + j] = r
+ })
+ done += batch.length
+ onProgress?.(done, urls.length)
+ }
+
+ return results
+}
diff --git a/src/stores/research-store.ts b/src/stores/research-store.ts
index 954cb2f6..f84e7f05 100644
--- a/src/stores/research-store.ts
+++ b/src/stores/research-store.ts
@@ -1,16 +1,20 @@
import { create } from "zustand"
import type { WebSearchResult } from "@/lib/web-search"
+import type { CrawledPage } from "@/lib/web-crawler"
export interface ResearchTask {
id: string
topic: string
searchQueries?: string[]
- status: "queued" | "searching" | "synthesizing" | "saving" | "done" | "error"
+ status: "queued" | "searching" | "crawling" | "synthesizing" | "saving" | "done" | "error"
webResults: WebSearchResult[]
synthesis: string
savedPath: string | null
error: string | null
createdAt: number
+ crawledPages: CrawledPage[]
+ crawlProgress: { done: number; total: number } | null
+ selectedUrls: Set<string>
}
interface ResearchState {
@@ -24,6 +28,13 @@ interface ResearchState {
setPanelOpen: (open: boolean) => void
getRunningCount: () => number
getNextQueued: () => ResearchTask | undefined
+
+ setCrawledPages: (id: string, pages: CrawledPage[]) => void
+ appendCrawledPages: (id: string, pages: CrawledPage[]) => void
+ updateCrawlProgress: (id: string, done: number, total: number) => void
+ toggleUrlSelection: (id: string, url: string) => void
+ selectAllSuccessful: (id: string) => void
+ clearSelection: (id: string) => void
}
let counter = 0
@@ -47,6 +58,9 @@ export const useResearchStore = create<ResearchState>((set, get) => ({
savedPath: null,
error: null,
createdAt: Date.now(),
+ crawledPages: [],
+ crawlProgress: null,
+ selectedUrls: new Set(),
},
],
panelOpen: true,
@@ -69,7 +83,7 @@ export const useResearchStore = create<ResearchState>((set, get) => ({
getRunningCount: () => {
const { tasks } = get()
return tasks.filter((t) =>
- t.status === "searching" || t.status === "synthesizing" || t.status === "saving"
+ t.status === "searching" || t.status === "crawling" || t.status === "synthesizing" || t.status === "saving"
).length
},
@@ -77,4 +91,50 @@ export const useResearchStore = create((set, get) => ({
const { tasks } = get()
return tasks.find((t) => t.status === "queued")
},
+
+ setCrawledPages: (id, pages) =>
+ set((state) => ({
+ tasks: state.tasks.map((t) => (t.id === id ? { ...t, crawledPages: pages } : t)),
+ })),
+
+ appendCrawledPages: (id, pages) =>
+ set((state) => ({
+ tasks: state.tasks.map((t) =>
+ t.id === id ? { ...t, crawledPages: [...t.crawledPages, ...pages] } : t
+ ),
+ })),
+
+ updateCrawlProgress: (id, done, total) =>
+ set((state) => ({
+ tasks: state.tasks.map((t) =>
+ t.id === id ? { ...t, crawlProgress: { done, total } } : t
+ ),
+ })),
+
+ toggleUrlSelection: (id, url) =>
+ set((state) => ({
+ tasks: state.tasks.map((t) => {
+ if (t.id !== id) return t
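+ // Copy into a new Set so the stored selection is never mutated in place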
+ const next = new Set(t.selectedUrls)
+ if (next.has(url)) next.delete(url)
+ else next.add(url)
+ return { ...t, selectedUrls: next }
+ }),
+ })),
+
+ selectAllSuccessful: (id) =>
+ set((state) => ({
+ tasks: state.tasks.map((t) => {
+ if (t.id !== id) return t
+ const urls = t.crawledPages.filter((p) => p.status === "success").map((p) => p.url)
+ return { ...t, selectedUrls: new Set(urls) }
+ }),
+ })),
+
+ clearSelection: (id) =>
+ set((state) => ({
+ tasks: state.tasks.map((t) =>
+ t.id === id ? { ...t, selectedUrls: new Set() } : t
+ ),
+ })),
}))