diff --git a/.github/workflows/docker-smoke.yml b/.github/workflows/docker-smoke.yml index bb779091d..3d5b926c9 100644 --- a/.github/workflows/docker-smoke.yml +++ b/.github/workflows/docker-smoke.yml @@ -62,7 +62,10 @@ jobs: - name: Build and start stack run: | - SIGNET_HTTP_PORT=8080 SIGNET_HTTPS_PORT=8443 docker compose -f deploy/docker/compose.yml up -d --build + SIGNET_HTTP_PORT=8080 SIGNET_HTTPS_PORT=8443 docker compose -f deploy/docker/compose.yml up -d --build || { + docker compose -f deploy/docker/compose.yml logs signet 2>&1 || true + exit 1 + } - name: Wait for proxy readiness run: | diff --git a/docs/API.md b/docs/API.md index 5731ce8b8..b36b86ee9 100644 --- a/docs/API.md +++ b/docs/API.md @@ -1786,6 +1786,57 @@ and source chunk embeddings. Source files are not modified. } ``` +### POST /api/sources/github + +Add or update a GitHub source. Queues an async sync that indexes issues, pull +requests, discussions, and/or docs from one or more repos into the knowledge +graph and embedding store. Requires `admin` permission. + +**Request body** + +```json +{ + "name": "Signet Issues", + "tokenRef": "GITHUB_TOKEN", + "repos": ["Signet-AI/signetai"], + "resourceTypes": ["issues", "pulls", "discussions", "docs"], + "state": "all", + "includeComments": true, + "labels": ["bug", "feature"], + "maxItemsPerRepo": 500, + "docPaths": ["README.md", "CHANGELOG.md"] +} +``` + +Only `repos` is required; each entry is `owner/repo` or `owner/*`. When +`resourceTypes` is omitted on a new source it defaults to `issues`, `pulls`, +and `docs`, plus `discussions` if a `tokenRef` is set. `maxItemsPerRepo` must +be an integer from 1 to 10000, and `docPaths` entries must be relative paths +ending in `.md` (globs such as `docs/**/*.md` are accepted). + +**Response** + +```json +{ + "source": { "id": "github:abc123def4567890", "kind": "github" }, + "created": true +} +``` + +### DELETE /api/sources/:sourceId + +Remove a source config and purge Signet-owned source artifacts, graph rows, +and source chunk embeddings. Source files are not modified. + +**Response** + +```json +{ + "source": { "id": "obsidian:abc123", "kind": "obsidian" }, + "purged": 150 +} +``` + ### POST /api/sources/pick-directory Best-effort local directory picker used by dashboard/browser flows. It returns @@ -4489,6 +4540,7 @@ silently disappear from the API reference. 
| GET | `/api/sources` | platform/daemon/src/routes/sources-routes.ts | | POST | `/api/sources/pick-directory` | platform/daemon/src/routes/sources-routes.ts | | POST | `/api/sources/obsidian` | platform/daemon/src/routes/sources-routes.ts | +| POST | `/api/sources/github` | platform/daemon/src/routes/sources-routes.ts | | DELETE | `/api/sources/:sourceId` | platform/daemon/src/routes/sources-routes.ts | | GET | `/api/knowledge/entities` | platform/daemon/src/routes/knowledge-routes.ts | | POST | `/api/knowledge/entities/:id/pin` | platform/daemon/src/routes/knowledge-routes.ts | diff --git a/platform/core/src/index.ts b/platform/core/src/index.ts index 3dfa0384c..8fd0665dd 100644 --- a/platform/core/src/index.ts +++ b/platform/core/src/index.ts @@ -221,18 +221,24 @@ export type { WorkspaceSourceRepoSyncResult, } from "./workspace-source-repo"; export { + addGitHubSource, addObsidianSource, + DEFAULT_GITHUB_DOC_PATHS, + DEFAULT_GITHUB_RESOURCE_TYPES, DEFAULT_OBSIDIAN_EXCLUDE_GLOBS, getAgentsDir, getSourcesConfigPath, loadSourcesConfig, markSourceIndexed, + parseGitHubSettings, removeSource, saveSourcesConfig, } from "./sources-config"; export type { + AddGitHubSourceInput, AddObsidianSourceInput, AddSourceResult, + GitHubSourceSettings, RemoveSourceResult, SignetSourceEntry, SignetSourceKind, diff --git a/platform/core/src/sources-config.test.ts b/platform/core/src/sources-config.test.ts index 2e1e58550..f859f87db 100644 --- a/platform/core/src/sources-config.test.ts +++ b/platform/core/src/sources-config.test.ts @@ -4,10 +4,12 @@ import { tmpdir } from "node:os"; import { join } from "node:path"; import { DEFAULT_OBSIDIAN_EXCLUDE_GLOBS, + addGitHubSource, addObsidianSource, getSourcesConfigPath, loadSourcesConfig, markSourceIndexed, + parseGitHubSettings, removeSource, } from "./sources-config"; @@ -152,4 +154,363 @@ describe("sources-config", () => { if (removed.ok === true) throw new Error("expected removeSource to fail"); expect(removed.error).toContain("not 
found"); }); + + describe("GitHub source", () => { + it("adds a GitHub source with repos and token ref", () => { + const agentsDir = tmp(); + const result = addGitHubSource( + { + repos: ["Signet-AI/signetai", "Signet-AI/sqmd"], + name: "Signet Repos", + tokenRef: "github-pat", + now: "2026-01-01T00:00:00.000Z", + }, + agentsDir, + ); + expect(result.ok).toBe(true); + if (result.ok === false) throw new Error(result.error); + expect(result.created).toBe(true); + expect(result.source.kind).toBe("github"); + expect(result.source.mode).toBe("read-only"); + expect(result.source.enabled).toBe(true); + expect(result.source.name).toBe("Signet Repos"); + + const config = loadSourcesConfig(agentsDir); + expect(config.sources).toHaveLength(1); + const settings = parseGitHubSettings(config.sources[0]?.settings); + expect(settings.repos).toEqual(["Signet-AI/signetai", "Signet-AI/sqmd"]); + expect(settings.tokenRef).toBe("github-pat"); + }); + + it("updates an existing GitHub source instead of duplicating", () => { + const agentsDir = tmp(); + const first = addGitHubSource( + { repos: ["owner/repo"], name: "Repo A", now: "2026-01-01T00:00:00.000Z" }, + agentsDir, + ); + const second = addGitHubSource( + { repos: ["owner/repo"], name: "Repo B", now: "2026-01-02T00:00:00.000Z" }, + agentsDir, + ); + expect(first.ok).toBe(true); + expect(second.ok).toBe(true); + if (second.ok === false) throw new Error(second.error); + expect(second.created).toBe(false); + expect(second.source.name).toBe("Repo B"); + expect(loadSourcesConfig(agentsDir).sources).toHaveLength(1); + }); + + it("preserves existing GitHub settings during partial updates", () => { + const agentsDir = tmp(); + const first = addGitHubSource( + { + repos: ["owner/repo"], + name: "Repo A", + tokenRef: "GITHUB_TOKEN", + resourceTypes: ["issues", "discussions"], + state: "closed", + includeComments: false, + labels: ["bug", "needs triage"], + docPaths: ["docs/setup.md"], + maxItemsPerRepo: 42, + now: 
"2026-01-01T00:00:00.000Z", + }, + agentsDir, + ); + expect(first.ok).toBe(true); + if (first.ok === false) throw new Error(first.error); + + const second = addGitHubSource( + { repos: ["owner/repo"], name: "Repo B", now: "2026-01-02T00:00:00.000Z" }, + agentsDir, + ); + expect(second.ok).toBe(true); + if (second.ok === false) throw new Error(second.error); + + expect(second.created).toBe(false); + expect(second.source.name).toBe("Repo B"); + const settings = parseGitHubSettings(second.source.settings); + expect(settings.tokenRef).toBe("GITHUB_TOKEN"); + expect(settings.resourceTypes).toEqual(["issues", "discussions"]); + expect(settings.state).toBe("closed"); + expect(settings.includeComments).toBe(false); + expect(settings.labels).toEqual(["bug", "needs triage"]); + expect(settings.docPaths).toEqual(["docs/setup.md"]); + expect(settings.maxItemsPerRepo).toBe(42); + }); + + it("keeps identical GitHub repo sets separate per agent", () => { + const agentsDir = tmp(); + const first = addGitHubSource( + { repos: ["owner/repo"], name: "Agent A Repo", agentId: "agent-a", now: "2026-01-01T00:00:00.000Z" }, + agentsDir, + ); + const second = addGitHubSource( + { repos: ["owner/repo"], name: "Agent B Repo", agentId: "agent-b", now: "2026-01-02T00:00:00.000Z" }, + agentsDir, + ); + + expect(first.ok).toBe(true); + expect(second.ok).toBe(true); + if (first.ok === false || second.ok === false) throw new Error("expected both sources to be added"); + expect(first.source.id).not.toBe(second.source.id); + expect(loadSourcesConfig(agentsDir).sources.map((source) => source.agentId)).toEqual(["agent-a", "agent-b"]); + }); + + it("defaults GitHub sources to the current SIGNET_AGENT_ID when caller omits agentId", () => { + const agentsDir = tmp(); + const previousAgentId = process.env.SIGNET_AGENT_ID; + process.env.SIGNET_AGENT_ID = "agent-env"; + try { + const result = addGitHubSource({ repos: ["owner/repo"], now: "2026-01-01T00:00:00.000Z" }, agentsDir); + 
expect(result.ok).toBe(true); + if (result.ok === false) throw new Error(result.error); + expect(result.source.agentId).toBe("agent-env"); + expect(loadSourcesConfig(agentsDir).sources[0]?.agentId).toBe("agent-env"); + } finally { + if (previousAgentId === undefined) Reflect.deleteProperty(process.env, "SIGNET_AGENT_ID"); + else process.env.SIGNET_AGENT_ID = previousAgentId; + } + }); + + it("defaults unauthenticated GitHub sources to REST-fetchable resource types", () => { + const agentsDir = tmp(); + const result = addGitHubSource({ repos: ["owner/repo"] }, agentsDir); + expect(result.ok).toBe(true); + if (result.ok === false) throw new Error(result.error); + expect(result.source.settings?.resourceTypes).toEqual(["issues", "pulls", "docs"]); + }); + + it("keeps discussion resource types in the default set when a tokenRef is provided", () => { + const agentsDir = tmp(); + const result = addGitHubSource({ repos: ["owner/repo"], tokenRef: "GITHUB_TOKEN" }, agentsDir); + expect(result.ok).toBe(true); + if (result.ok === false) throw new Error(result.error); + expect(result.source.settings?.resourceTypes).toEqual(["issues", "pulls", "discussions", "docs"]); + }); + + it("rejects an explicit empty GitHub resource type list", () => { + const agentsDir = tmp(); + const result = addGitHubSource({ repos: ["owner/repo"], resourceTypes: [] }, agentsDir); + expect(result.ok).toBe(false); + if (result.ok === true) throw new Error("expected failure"); + expect(result.error).toContain("resourceTypes"); + }); + + it("rejects a non-integer GitHub max item limit", () => { + const agentsDir = tmp(); + const result = addGitHubSource({ repos: ["owner/repo"], maxItemsPerRepo: 1.5 }, agentsDir); + expect(result.ok).toBe(false); + if (result.ok === true) throw new Error("expected failure"); + expect(result.error).toContain("integer"); + }); + + it("rejects invalid GitHub runtime fields before writing config", () => { + const agentsDir = tmp(); + const state = addGitHubSource({ repos: 
["owner/repo"], state: "draft" as never }, agentsDir); + expect(state.ok).toBe(false); + if (state.ok === true) throw new Error("expected state failure"); + expect(state.error).toContain("state"); + + const labels = addGitHubSource({ repos: ["owner/repo"], labels: ["bug", 123] as never }, agentsDir); + expect(labels.ok).toBe(false); + if (labels.ok === true) throw new Error("expected labels failure"); + expect(labels.error).toContain("labels"); + + const comments = addGitHubSource({ repos: ["owner/repo"], includeComments: "yes" as never }, agentsDir); + expect(comments.ok).toBe(false); + if (comments.ok === true) throw new Error("expected comments failure"); + expect(comments.error).toContain("includeComments"); + + expect(loadSourcesConfig(agentsDir).sources).toEqual([]); + }); + + it("rejects unsafe GitHub doc paths before writing config", () => { + const agentsDir = tmp(); + for (const docPath of [ + "/README.md", + "../README.md", + "docs/../README.md", + "README.md?ref=dev", + "src/daemon.ts", + "docs/openapi.yaml", + ]) { + const result = addGitHubSource({ repos: ["owner/repo"], docPaths: [docPath] }, agentsDir); + expect(result.ok).toBe(false); + if (result.ok === true) throw new Error("expected doc path failure"); + expect(result.error).toContain("docPaths"); + } + expect(loadSourcesConfig(agentsDir).sources).toEqual([]); + }); + + it("accepts markdown doc paths and markdown globs", () => { + const agentsDir = tmp(); + for (const docPath of ["README.md", "docs/setup.md", "docs/*.md", "docs/**/*.md"]) { + const result = addGitHubSource({ repos: ["owner/repo"], docPaths: [docPath] }, agentsDir); + expect(result.ok).toBe(true); + if (result.ok === false) throw new Error(result.error); + removeSource(result.source.id, agentsDir); + } + }); + + it("drops malformed persisted GitHub sources instead of treating them as empty sources", () => { + const agentsDir = tmp(); + writeFileSync( + getSourcesConfigPath(agentsDir), + JSON.stringify({ + version: 1, + sources: [ 
+ { + id: "github:bad", + kind: "github", + name: "Bad GitHub", + root: "", + enabled: true, + mode: "read-only", + createdAt: "2026-01-01T00:00:00.000Z", + updatedAt: "2026-01-01T00:00:00.000Z", + // agentId is present so the entry can only be dropped for its missing settings. + agentId: "default", + }, + ], + }), + "utf8", + ); + + expect(loadSourcesConfig(agentsDir).sources).toEqual([]); + }); + + it("drops persisted GitHub sources with invalid resource types instead of widening to defaults", () => { + const agentsDir = tmp(); + writeFileSync( + getSourcesConfigPath(agentsDir), + JSON.stringify({ + version: 1, + sources: [ + { + id: "github:bad-types", + kind: "github", + name: "Bad GitHub Types", + root: "", + enabled: true, + mode: "read-only", + createdAt: "2026-01-01T00:00:00.000Z", + updatedAt: "2026-01-01T00:00:00.000Z", + // agentId is present so the entry can only be dropped for its invalid resourceTypes. + agentId: "default", + settings: { repos: ["owner/repo"], resourceTypes: ["issue"] }, + }, + ], + }), + "utf8", + ); + + expect(loadSourcesConfig(agentsDir).sources).toEqual([]); + }); + + it("drops persisted GitHub sources without an owning agent", () => { + const agentsDir = tmp(); + writeFileSync( + getSourcesConfigPath(agentsDir), + JSON.stringify({ + version: 1, + sources: [ + { + id: "github:unscoped", + kind: "github", + name: "Unscoped GitHub", + root: "", + enabled: true, + mode: "read-only", + createdAt: "2026-01-01T00:00:00.000Z", + updatedAt: "2026-01-01T00:00:00.000Z", + settings: { repos: ["owner/repo"] }, + }, + ], + }), + "utf8", + ); + + expect(loadSourcesConfig(agentsDir).sources).toEqual([]); + }); + + it("drops persisted GitHub sources with non-integer max item limits", () => { + const agentsDir = tmp(); + writeFileSync( + getSourcesConfigPath(agentsDir), + JSON.stringify({ + version: 1, + sources: [ + { + id: "github:bad-max", + kind: "github", + name: "Bad GitHub Max", + root: "", + enabled: true, + mode: "read-only", + createdAt: "2026-01-01T00:00:00.000Z", + updatedAt: "2026-01-01T00:00:00.000Z", + // agentId is present so the entry can only be dropped for its non-integer limit. + agentId: "default", + settings: { repos: ["owner/repo"], maxItemsPerRepo: 1.5 }, + }, + ], + }), + "utf8", + ); + + 
expect(loadSourcesConfig(agentsDir).sources).toEqual([]); + }); + + it("drops persisted GitHub sources with unsafe doc paths", () => { + const agentsDir = tmp(); + writeFileSync( + getSourcesConfigPath(agentsDir), + JSON.stringify({ + version: 1, + sources: [ + { + id: "github:bad-doc-path", + kind: "github", + name: "Bad GitHub Docs", + root: "", + enabled: true, + mode: "read-only", + createdAt: "2026-01-01T00:00:00.000Z", + updatedAt: "2026-01-01T00:00:00.000Z", + agentId: "default", + settings: { repos: ["owner/repo"], docPaths: ["src/daemon.ts"] }, + }, + ], + }), + "utf8", + ); + + expect(loadSourcesConfig(agentsDir).sources).toEqual([]); + }); + + it("requires at least one repo", () => { + const agentsDir = tmp(); + const result = addGitHubSource({ repos: [] }, agentsDir); + expect(result.ok).toBe(false); + if (result.ok === true) throw new Error("expected failure"); + expect(result.error).toContain("repo"); + }); + + it("coexists with Obsidian sources", () => { + const agentsDir = tmp(); + const vault = join(agentsDir, "vault"); + mkdirSync(vault, { recursive: true }); + + addObsidianSource({ root: vault, name: "My Vault" }, agentsDir); + addGitHubSource({ repos: ["owner/repo"], name: "GitHub" }, agentsDir); + + const config = loadSourcesConfig(agentsDir); + expect(config.sources).toHaveLength(2); + expect(config.sources.map((s) => s.kind)).toEqual(["obsidian", "github"]); + }); + + it("defaults unauthenticated resource types to REST-fetchable types", () => { + const agentsDir = tmp(); + const result = addGitHubSource({ repos: ["owner/repo"] }, agentsDir); + expect(result.ok).toBe(true); + if (result.ok === false) throw new Error(result.error); + const settings = parseGitHubSettings(loadSourcesConfig(agentsDir).sources[0]?.settings); + expect(settings.resourceTypes).toEqual(["issues", "pulls", "docs"]); + }); + }); }); diff --git a/platform/core/src/sources-config.ts b/platform/core/src/sources-config.ts index 424cb7cdd..40824661b 100644 --- 
a/platform/core/src/sources-config.ts +++ b/platform/core/src/sources-config.ts @@ -3,9 +3,25 @@ import { existsSync, mkdirSync, readFileSync, renameSync, rmSync, statSync, writ import { homedir } from "node:os"; import { dirname, resolve } from "node:path"; -export type SignetSourceKind = "obsidian"; +export type SignetSourceKind = "obsidian" | "github"; export type SignetSourceMode = "read-only"; +export interface GitHubSourceSettings { + readonly repos: readonly string[]; + readonly tokenRef?: string; + readonly resourceTypes: readonly ("issues" | "pulls" | "discussions" | "docs")[]; + readonly state?: "open" | "closed" | "all"; + readonly includeComments?: boolean; + readonly labels?: readonly string[]; + readonly docPaths?: readonly string[]; + readonly maxItemsPerRepo?: number; +} + +export const DEFAULT_GITHUB_RESOURCE_TYPES = ["issues", "pulls", "discussions", "docs"] as const; +export const DEFAULT_GITHUB_RESOURCE_TYPES_NO_TOKEN = ["issues", "pulls", "docs"] as const; +const VALID_GITHUB_RESOURCE_TYPES = new Set(DEFAULT_GITHUB_RESOURCE_TYPES); +export const DEFAULT_GITHUB_DOC_PATHS = ["README.md", "CHANGELOG.md"] as const; + export interface SignetSourceEntry { readonly id: string; readonly kind: SignetSourceKind; @@ -17,6 +33,8 @@ export interface SignetSourceEntry { readonly updatedAt: string; readonly lastIndexedAt?: string; readonly excludeGlobs?: readonly string[]; + readonly settings?: Readonly>; + readonly agentId?: string; } export const DEFAULT_OBSIDIAN_EXCLUDE_GLOBS = [ @@ -39,6 +57,20 @@ export interface AddObsidianSourceInput { readonly now?: string; } +export interface AddGitHubSourceInput { + readonly repos: readonly string[]; + readonly name?: string; + readonly tokenRef?: string; + readonly resourceTypes?: readonly ("issues" | "pulls" | "discussions" | "docs")[]; + readonly state?: "open" | "closed" | "all"; + readonly includeComments?: boolean; + readonly labels?: readonly string[]; + readonly docPaths?: readonly string[]; + readonly 
maxItemsPerRepo?: number; + readonly now?: string; + readonly agentId?: string; +} + export type AddSourceResult = | { readonly ok: true; readonly source: SignetSourceEntry; readonly created: boolean } | { readonly ok: false; readonly error: string }; @@ -163,6 +195,182 @@ function addObsidianSourceChecked(input: AddObsidianSourceInput, agentsDir = get return { ok: true, source, created: true }; } +/** Adds or updates a GitHub source; the work runs inside withSourcesConfigLock for agentsDir. */ +export function addGitHubSource(input: AddGitHubSourceInput, agentsDir = getAgentsDir()): AddSourceResult { + return withSourcesConfigLock(agentsDir, () => addGitHubSourceUnlocked(input, agentsDir)); +} + +function resolveSourceAgentId(agentId?: string): string { + return cleanName(agentId) ?? cleanName(process.env.SIGNET_AGENT_ID) ?? "default"; +} + +function addGitHubSourceUnlocked(input: AddGitHubSourceInput, agentsDir = getAgentsDir()): AddSourceResult { + try { + return addGitHubSourceChecked(input, agentsDir); + } catch (err) { + const detail = err instanceof Error ? err.message : String(err); + return { ok: false, error: detail }; + } +} + +/** Validates the input, then creates or updates the GitHub source matching the resolved agent and repo set. */ +function addGitHubSourceChecked(input: AddGitHubSourceInput, agentsDir = getAgentsDir()): AddSourceResult { + // Check repos at runtime like every other field, so a malformed value yields a clear error instead of a TypeError message. + if (!isStringArray(input.repos)) return { ok: false, error: "repos must be an array of strings" }; + // Trim, drop blanks, and de-duplicate so a repeated repo does not change the source identity. + const repos = cleanStringArray(input.repos); + if (repos.length === 0) return { ok: false, error: "At least one repo (owner/repo or owner/*) is required" }; + for (const repo of repos) { + if (!/^[a-zA-Z0-9_.-]+\/[a-zA-Z0-9_*.-]+$/.test(repo)) { + return { ok: false, error: `Invalid repo pattern: ${repo}. 
Expected owner/repo or owner/*` }; + } + } + + if (input.maxItemsPerRepo !== undefined) { + if (!Number.isInteger(input.maxItemsPerRepo) || input.maxItemsPerRepo < 1 || input.maxItemsPerRepo > 10000) { + return { ok: false, error: "maxItemsPerRepo must be an integer between 1 and 10000" }; + } + } + + if (input.resourceTypes) { + if (!Array.isArray(input.resourceTypes)) { + return { ok: false, error: "resourceTypes must be an array" }; + } + if (input.resourceTypes.length === 0) { + return { ok: false, error: "resourceTypes must include at least one resource type" }; + } + const invalid = input.resourceTypes.filter((t) => !VALID_GITHUB_RESOURCE_TYPES.has(t)); + if (invalid.length > 0) { + return { + ok: false, + error: `Invalid resource types: ${invalid.join(", ")}. Must be one of: ${[...DEFAULT_GITHUB_RESOURCE_TYPES].join(", ")}`, + }; + } + } + if (input.tokenRef !== undefined && typeof input.tokenRef !== "string") { + return { ok: false, error: "tokenRef must be a string" }; + } + if (input.state !== undefined && input.state !== "open" && input.state !== "closed" && input.state !== "all") { + return { ok: false, error: "state must be one of: open, closed, all" }; + } + if (input.includeComments !== undefined && typeof input.includeComments !== "boolean") { + return { ok: false, error: "includeComments must be a boolean" }; + } + if (input.labels !== undefined && !isStringArray(input.labels)) { + return { ok: false, error: "labels must be an array of strings" }; + } + if (input.docPaths !== undefined) { + if (!isStringArray(input.docPaths)) return { ok: false, error: "docPaths must be an array of strings" }; + const invalid = input.docPaths.filter((path) => !isSafeGitHubDocPath(path)); + if (invalid.length > 0) { + return { ok: false, error: `Invalid docPaths: ${invalid.join(", ")}` }; + } + } + + const now = input.now ?? 
new Date().toISOString(); + const cfg = loadSourcesConfigForWrite(agentsDir); + const agentId = resolveSourceAgentId(input.agentId); + const settingsKey = [...repos].sort().join(","); + const existing = cfg.sources.find( + (source) => + source.kind === "github" && + (source.agentId ?? "default") === agentId && + Array.isArray(source.settings?.repos) && + [...(source.settings.repos as string[])].sort().join(",") === settingsKey, + ); + + if (existing) { + const existingSettings = parseGitHubSettings(existing.settings); + const updated: SignetSourceEntry = { + ...existing, + name: cleanName(input.name) ?? existing.name, + enabled: true, + updatedAt: now, + settings: buildGitHubSettings(input, repos, existingSettings), + agentId, + }; + saveSourcesConfig( + { + version: SOURCES_CONFIG_VERSION, + sources: cfg.sources.map((source) => (source.id === existing.id ? updated : source)), + }, + agentsDir, + ); + return { ok: true, source: updated, created: false }; + } + + const source: SignetSourceEntry = { + id: `github:${createHash("sha256").update(`${agentId}\0${settingsKey}`).digest("hex").slice(0, 16)}`, + kind: "github", + name: cleanName(input.name) ?? repos[0], + root: "", + enabled: true, + mode: "read-only", + createdAt: now, + updatedAt: now, + settings: buildGitHubSettings(input, repos), + agentId, + }; + saveSourcesConfig({ version: SOURCES_CONFIG_VERSION, sources: [...cfg.sources, source] }, agentsDir); + return { ok: true, source, created: true }; +} + +function buildGitHubSettings( + input: AddGitHubSourceInput, + repos: readonly string[], + existing?: GitHubSourceSettings, +): Readonly> { + const tokenRef = input.tokenRef !== undefined ? input.tokenRef.trim() || undefined : existing?.tokenRef; + const resourceTypes = input.resourceTypes + ? [...input.resourceTypes] + : existing?.resourceTypes?.length + ? [...existing.resourceTypes] + : tokenRef + ? 
[...DEFAULT_GITHUB_RESOURCE_TYPES] + : [...DEFAULT_GITHUB_RESOURCE_TYPES_NO_TOKEN]; + return { + repos, + tokenRef, + resourceTypes, + state: input.state ?? existing?.state ?? "all", + includeComments: input.includeComments ?? existing?.includeComments ?? true, + labels: input.labels !== undefined ? cleanStringArray(input.labels) : existing?.labels, + docPaths: input.docPaths !== undefined ? cleanStringArray(input.docPaths) : (existing?.docPaths ?? [...DEFAULT_GITHUB_DOC_PATHS]), + maxItemsPerRepo: input.maxItemsPerRepo ?? existing?.maxItemsPerRepo ?? 500, + }; +} + +/** + * Leniently normalizes a raw settings record into GitHubSourceSettings. + * Missing or malformed fields fall back to defaults rather than throwing; + * maxItemsPerRepo is only kept when it is an integer within 1-10000, the same + * bound enforced when a source is added or loaded. + */ +export function parseGitHubSettings(raw: Readonly<Record<string, unknown>> | undefined): GitHubSourceSettings { + if (!raw) { + return { repos: [], resourceTypes: [...DEFAULT_GITHUB_RESOURCE_TYPES] }; + } + const repos = + Array.isArray(raw.repos) && raw.repos.every((r) => typeof r === "string") ? (raw.repos as string[]) : []; + let resourceTypes = + Array.isArray(raw.resourceTypes) && raw.resourceTypes.every((t) => typeof t === "string") + ? (raw.resourceTypes as string[]).filter((t): t is "issues" | "pulls" | "discussions" | "docs" => + ["issues", "pulls", "discussions", "docs"].includes(t), + ) + : [...DEFAULT_GITHUB_RESOURCE_TYPES]; + if (resourceTypes.length === 0) resourceTypes = [...DEFAULT_GITHUB_RESOURCE_TYPES]; + return { + repos, + tokenRef: typeof raw.tokenRef === "string" ? raw.tokenRef : undefined, + resourceTypes, + state: raw.state === "open" || raw.state === "closed" || raw.state === "all" ? raw.state : "all", + includeComments: typeof raw.includeComments === "boolean" ? raw.includeComments : true, + labels: + Array.isArray(raw.labels) && raw.labels.every((l) => typeof l === "string") + ? (raw.labels as string[]) + : undefined, + docPaths: + Array.isArray(raw.docPaths) && raw.docPaths.every((p) => typeof p === "string") + ? (raw.docPaths as string[]) + : [...DEFAULT_GITHUB_DOC_PATHS], + maxItemsPerRepo: typeof raw.maxItemsPerRepo === "number" && Number.isInteger(raw.maxItemsPerRepo) && raw.maxItemsPerRepo >= 1 && raw.maxItemsPerRepo <= 10000 ? 
raw.maxItemsPerRepo : 500, + }; +} + export function markSourceIndexed( sourceId: string, indexedAt = new Date().toISOString(), @@ -259,6 +462,31 @@ function cleanExcludeGlobs(values: readonly string[] | undefined): readonly stri return cleaned.length > 0 ? cleaned : []; } +function cleanStringArray(values: readonly string[]): readonly string[] { + return Array.from(new Set(values.map((value) => value.trim()).filter(Boolean))); +} + +function isStringArray(value: unknown): value is readonly string[] { + return Array.isArray(value) && value.every((entry) => typeof entry === "string"); +} + +function isMarkdownDocPath(path: string): boolean { + return path.toLowerCase().endsWith(".md"); +} + +function isMarkdownDocGlob(path: string): boolean { + const lowered = path.toLowerCase(); + return lowered.endsWith("/*.md") || lowered.endsWith("/**/*.md"); +} + +function isSafeGitHubDocPath(value: string): boolean { + const path = value.trim(); + if (!path) return false; + if (path.startsWith("/") || path.includes("\\") || path.includes("?") || path.includes("#")) return false; + if (path.split("/").some((segment) => segment === "" || segment === "." || segment === "..")) return false; + return isMarkdownDocPath(path) || isMarkdownDocGlob(path); +} + function mergeDefaultObsidianExcludeGlobs(values: readonly string[] | undefined): readonly string[] { return [...DEFAULT_OBSIDIAN_EXCLUDE_GLOBS, ...(cleanExcludeGlobs(values) ?? 
[])].filter( (value, index, all) => all.indexOf(value) === index, @@ -270,18 +498,76 @@ function isRecord(value: unknown): value is Record { } function isSourceEntry(value: unknown): value is SignetSourceEntry { - return ( - isRecord(value) && - value.kind === "obsidian" && - typeof value.id === "string" && - typeof value.name === "string" && - typeof value.root === "string" && - typeof value.enabled === "boolean" && - value.mode === "read-only" && - typeof value.createdAt === "string" && - typeof value.updatedAt === "string" && - (value.lastIndexedAt === undefined || typeof value.lastIndexedAt === "string") && - (value.excludeGlobs === undefined || - (Array.isArray(value.excludeGlobs) && value.excludeGlobs.every((entry) => typeof entry === "string"))) - ); + if ( + !( + isRecord(value) && + (value.kind === "obsidian" || value.kind === "github") && + typeof value.id === "string" && + typeof value.name === "string" && + typeof value.root === "string" && + typeof value.enabled === "boolean" && + value.mode === "read-only" && + typeof value.createdAt === "string" && + typeof value.updatedAt === "string" && + (value.lastIndexedAt === undefined || typeof value.lastIndexedAt === "string") && + (value.excludeGlobs === undefined || + (Array.isArray(value.excludeGlobs) && value.excludeGlobs.every((entry) => typeof entry === "string"))) && + (value.settings === undefined || isRecord(value.settings)) && + (value.agentId === undefined || typeof value.agentId === "string") + ) + ) { + return false; + } + if (value.kind === "github") { + return typeof value.agentId === "string" && isValidGitHubSettingsRecord(value.settings); + } + return true; +} + +function isValidGitHubSettingsRecord(value: unknown): value is Readonly> { + if (!isRecord(value)) return false; + if ( + !Array.isArray(value.repos) || + value.repos.length === 0 || + !value.repos.every((repo) => typeof repo === "string") + ) { + return false; + } + if (value.tokenRef !== undefined && typeof value.tokenRef !== 
"string") return false; + if (value.includeComments !== undefined && typeof value.includeComments !== "boolean") return false; + if (value.state !== undefined && value.state !== "open" && value.state !== "closed" && value.state !== "all") { + return false; + } + if (value.resourceTypes !== undefined) { + if ( + !Array.isArray(value.resourceTypes) || + value.resourceTypes.length === 0 || + !value.resourceTypes.every((type) => typeof type === "string" && VALID_GITHUB_RESOURCE_TYPES.has(type)) + ) { + return false; + } + } + if (value.labels !== undefined) { + if (!Array.isArray(value.labels) || !value.labels.every((label) => typeof label === "string")) return false; + } + if (value.docPaths !== undefined) { + if ( + !Array.isArray(value.docPaths) || + !value.docPaths.every((path) => typeof path === "string" && isSafeGitHubDocPath(path)) + ) { + return false; + } + } + if (value.maxItemsPerRepo !== undefined) { + const maxItemsPerRepo = value.maxItemsPerRepo; + if ( + typeof maxItemsPerRepo !== "number" || + !Number.isInteger(maxItemsPerRepo) || + maxItemsPerRepo < 1 || + maxItemsPerRepo > 10000 + ) { + return false; + } + } + return true; } diff --git a/platform/daemon/src/daemon.ts b/platform/daemon/src/daemon.ts index dc043cf88..95719e05f 100644 --- a/platform/daemon/src/daemon.ts +++ b/platform/daemon/src/daemon.ts @@ -40,6 +40,7 @@ import { fetchEmbedding } from "./embedding-fetch"; import { type EmbeddingTrackerHandle, startEmbeddingTracker } from "./embedding-tracker"; import { initFeatureFlags } from "./feature-flags"; import { writeFileIfChangedAsync } from "./file-sync"; +import { type GitHubSourceBridgeHandle, startGitHubSourceBridge } from "./github-source-bridge"; import { createSignetHttpServer } from "./http-server"; import { syncAgentWorkspaces } from "./identity-sync"; import { getOrCreateInferenceRouter } from "./inference-router.js"; @@ -243,6 +244,7 @@ setupDashboardRoutes(app); let watcher: ReturnType | null = null; let nativeMemoryBridge: 
NativeMemoryBridgeHandle | null = null; +let githubSourceBridge: GitHubSourceBridgeHandle | null = null; // Track ingested files to avoid re-processing (path -> content hash) const ingestedMemoryFiles = new Map(); @@ -1189,6 +1191,11 @@ async function cleanup() { nativeMemoryBridge = null; } + if (githubSourceBridge) { + await githubSourceBridge.close(); + githubSourceBridge = null; + } + if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = undefined; @@ -1610,6 +1617,27 @@ async function main() { }); } + if (!githubSourceBridge) { + const embeddingCfg = memoryCfg.embedding.provider !== "none" ? memoryCfg.embedding : undefined; + githubSourceBridge = startGitHubSourceBridge( + () => loadSourcesConfig(AGENTS_DIR).sources.filter((s) => s.enabled && s.kind === "github"), + { + agentsDir: AGENTS_DIR, + pollIntervalMs: 300_000, + embeddingConfig: embeddingCfg, + fetchEmbedding: embeddingCfg ? fetchEmbedding : undefined, + }, + ); + githubSourceBridge.sync().catch((e) => { + logger.error( + "daemon", + "Failed to sync GitHub sources", + undefined, + e instanceof Error ? 
import { afterEach, describe, expect, mock, test } from "bun:test";
import { mkdirSync, mkdtempSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { type SignetSourceEntry, loadSourcesConfig, saveSourcesConfig } from "../../core/src/sources-config";
import { closeDbAccessor, initDbAccessor } from "./db-accessor";
import { startGitHubSourceBridge } from "./github-source-bridge";
import { clearSourceIndexProgressForTests, getSourceIndexJob } from "./source-index-progress";

const originalFetch = globalThis.fetch;

// Every temp directory created by a test, so afterEach can remove it again.
const tempDirs: string[] = [];

afterEach(() => {
	globalThis.fetch = originalFetch;
	clearSourceIndexProgressForTests();
	// Close the database before deleting the directory that contains it.
	closeDbAccessor();
	for (const dir of tempDirs.splice(0)) {
		rmSync(dir, { recursive: true, force: true });
	}
});

/** Creates a throwaway agents directory and opens `memories.db` inside it. */
function createAgentsDir(prefix: string): string {
	const agentsDir = mkdtempSync(join(tmpdir(), prefix));
	tempDirs.push(agentsDir);
	mkdirSync(agentsDir, { recursive: true });
	initDbAccessor(join(agentsDir, "memories.db"));
	return agentsDir;
}

/** Builds an enabled GitHub source entry; `overrides` replaces any default field. */
function makeSource(overrides: Partial<SignetSourceEntry> & { id: string }): SignetSourceEntry {
	return {
		kind: "github",
		name: "Test Repo",
		root: "",
		enabled: true,
		mode: "read-only",
		createdAt: "2026-01-01T00:00:00.000Z",
		updatedAt: "2026-01-01T00:00:00.000Z",
		agentId: "default",
		settings: {
			repos: ["Signet-AI/signetai"],
			resourceTypes: ["issues"],
			maxItemsPerRepo: 1,
		},
		...overrides,
	};
}

/** Canned body for `GET /repos/Signet-AI/<repo>`. */
function repoInfoResponse(repo: string): Response {
	return new Response(
		JSON.stringify({
			owner: { login: "Signet-AI" },
			name: repo,
			full_name: `Signet-AI/${repo}`,
			default_branch: "main",
			html_url: `https://github.com/Signet-AI/${repo}`,
		}),
		{ status: 200 },
	);
}

function jsonResponse(body: unknown, status = 200): Response {
	return new Response(JSON.stringify(body), { status });
}

/**
 * Replaces `globalThis.fetch` with a mock that asks `route` for a response and
 * throws for any URL the route does not recognise, so unexpected requests fail
 * the test loudly.
 */
function stubFetch(route: (url: string) => Response | undefined): void {
	globalThis.fetch = mock(async (input: string | URL | Request) => {
		const url = String(input);
		const response = route(url);
		if (response) return response;
		throw new Error(`Unexpected fetch: ${url}`);
	}) as typeof fetch;
}

/** Runs one sync pass with polling disabled and always closes the bridge. */
async function syncOnce(
	sources: readonly SignetSourceEntry[],
	options: { agentsDir: string; agentId?: string },
): Promise<number> {
	const bridge = startGitHubSourceBridge(sources, { ...options, pollIntervalMs: 0 });
	try {
		return await bridge.sync();
	} finally {
		await bridge.close();
	}
}

describe("startGitHubSourceBridge", () => {
	test("marks GitHub sources indexed and completes the source job after a successful sync", async () => {
		const agentsDir = createAgentsDir("signet-github-bridge-");
		const source = makeSource({ id: "github:test" });
		saveSourcesConfig({ version: 1, sources: [source] }, agentsDir);

		stubFetch((url) => {
			if (url.endsWith("/repos/Signet-AI/signetai")) return repoInfoResponse("signetai");
			if (url.includes("/repos/Signet-AI/signetai/issues?")) return jsonResponse([]);
			return undefined;
		});

		expect(await syncOnce([source], { agentsDir })).toBe(0);

		const saved = loadSourcesConfig(agentsDir).sources[0];
		expect(saved?.lastIndexedAt).toBeString();
		expect(getSourceIndexJob(source.id)).toMatchObject({ status: "complete", indexed: 0 });
	});

	test("marks the source job failed when sync completes with partial errors", async () => {
		const agentsDir = createAgentsDir("signet-github-bridge-errors-");
		const source = makeSource({ id: "github:test-errors" });
		saveSourcesConfig({ version: 1, sources: [source] }, agentsDir);

		stubFetch((url) => {
			if (url.endsWith("/repos/Signet-AI/signetai")) return repoInfoResponse("signetai");
			if (url.includes("/repos/Signet-AI/signetai/issues?")) {
				return jsonResponse({ message: "unprocessable" }, 422);
			}
			return undefined;
		});

		expect(await syncOnce([source], { agentsDir })).toBe(0);

		const saved = loadSourcesConfig(agentsDir).sources[0];
		expect(saved?.lastIndexedAt).toBeUndefined();
		expect(getSourceIndexJob(source.id)).toMatchObject({ status: "error" });
	});

	test("syncs enabled github sources for non-default agent ids instead of silently skipping them", async () => {
		const agentsDir = createAgentsDir("signet-github-bridge-agent-scope-");
		const sources: SignetSourceEntry[] = [
			makeSource({
				id: "github:default",
				name: "Default Repo",
				agentId: "default",
				settings: { repos: ["Signet-AI/default-repo"], resourceTypes: ["issues"], maxItemsPerRepo: 1 },
			}),
			makeSource({
				id: "github:agent-b",
				name: "Agent B Repo",
				agentId: "agent-b",
				settings: { repos: ["Signet-AI/agent-b-repo"], resourceTypes: ["issues"], maxItemsPerRepo: 1 },
			}),
		];
		saveSourcesConfig({ version: 1, sources }, agentsDir);

		const seenRepos: string[] = [];
		stubFetch((url) => {
			if (url.endsWith("/repos/Signet-AI/default-repo")) {
				seenRepos.push("default");
				return repoInfoResponse("default-repo");
			}
			if (url.endsWith("/repos/Signet-AI/agent-b-repo")) {
				seenRepos.push("agent-b");
				return repoInfoResponse("agent-b-repo");
			}
			if (url.includes("/issues?")) return jsonResponse([]);
			return undefined;
		});

		expect(await syncOnce(sources, { agentsDir, agentId: "default" })).toBe(0);

		expect(seenRepos).toEqual(["default", "agent-b"]);
		expect(getSourceIndexJob("github:default")).toMatchObject({ status: "complete" });
		expect(getSourceIndexJob("github:agent-b")).toMatchObject({ status: "complete" });
	});

	test("marks the source job failed when discussions are requested without a token", async () => {
		const agentsDir = createAgentsDir("signet-github-bridge-discussions-token-");
		const source = makeSource({
			id: "github:discussions-no-token",
			name: "Discussion Repo",
			settings: { repos: ["Signet-AI/signetai"], resourceTypes: ["discussions"], maxItemsPerRepo: 1 },
		});
		saveSourcesConfig({ version: 1, sources: [source] }, agentsDir);

		// Only the repo lookup is expected; any discussions request would throw.
		stubFetch((url) => (url.endsWith("/repos/Signet-AI/signetai") ? repoInfoResponse("signetai") : undefined));

		expect(await syncOnce([source], { agentsDir })).toBe(0);

		const saved = loadSourcesConfig(agentsDir).sources[0];
		expect(saved?.lastIndexedAt).toBeUndefined();
		expect(getSourceIndexJob(source.id)).toMatchObject({ status: "error" });
	});
});
/** Handle returned by `startGitHubSourceBridge`. */
export interface GitHubSourceBridgeHandle {
	/** Runs one sync pass and resolves to the number of resources indexed. */
	readonly sync: () => Promise<number>;
	/** Stops polling and waits for an in-flight sync pass to settle. */
	readonly close: () => Promise<void>;
}

export interface GitHubSourceBridgeOptions {
	/** Agent that owns the indexed rows; falls back to `resolveDaemonAgentId()`. */
	readonly agentId?: string;
	readonly pollIntervalMs?: number;
	/** Embeddings are written only when both `embeddingConfig` and `fetchEmbedding` are set. */
	readonly embeddingConfig?: EmbeddingConfig;
	readonly fetchEmbedding?: SourceEmbeddingFetch;
	/** Directory holding the sources config; falls back to `SIGNET_PATH`, then `~/.agents`. */
	readonly agentsDir?: string;
	/** Returns false once the source should no longer be written to. */
	readonly sourceActiveCheck?: () => boolean;
}

interface ResolvedRepo {
	readonly owner: string;
	readonly repo: string;
	readonly fullName: string;
	defaultBranch: string;
}

/** Comment shape handed to the structure/embedding indexers. */
interface SourceComment {
	author: string | null;
	body: string;
	createdAt: string;
}

/**
 * Expands the `owner/repo` patterns in `settings.repos` into concrete repos.
 *
 * Patterns whose repo part contains `*` are expanded through `expandRepoGlob`
 * (capped at `settings.maxItemsPerRepo`); everything else is taken literally.
 * Each repo starts with a default branch of `"main"`, which is replaced by the
 * value from `fetchRepoInfo` when that lookup returns something.
 *
 * @param settings Parsed GitHub source settings.
 * @param token Optional API token forwarded to the fetch helpers.
 * @returns The resolved repos, in pattern order.
 */
export async function resolveRepos(settings: GitHubSourceSettings, token?: string): Promise<ResolvedRepo[]> {
	const resolved: ResolvedRepo[] = [];
	for (const pattern of settings.repos) {
		const [owner, repoPart] = pattern.split("/");
		if (!owner || !repoPart) {
			// Surface the skip so a typo in the config does not silently index nothing.
			logger.warn("github-source", "Skipping malformed repo pattern (expected owner/repo)", { pattern });
			continue;
		}
		if (!repoPart.includes("*")) {
			resolved.push({ owner, repo: repoPart, fullName: `${owner}/${repoPart}`, defaultBranch: "main" });
			continue;
		}
		const expanded = await expandRepoGlob(owner, repoPart, token, settings.maxItemsPerRepo);
		if (expanded.truncated) {
			logger.warn("github-source", "Wildcard repo source expansion hit configured cap", {
				owner,
				pattern: repoPart,
				limit: settings.maxItemsPerRepo,
				matchedRepos: expanded.repos.length,
			});
		}
		for (const fullName of expanded.repos) {
			const [o, r] = fullName.split("/");
			if (!o || !r) continue;
			resolved.push({ owner: o, repo: r, fullName, defaultBranch: "main" });
		}
	}
	for (const repo of resolved) {
		const info = await fetchRepoInfo({ owner: repo.owner, repo: repo.repo, token });
		if (info) {
			repo.defaultBranch = info.defaultBranch;
		}
	}
	return resolved;
}

export interface GitHubSourceSyncResult {
	/** Number of resources passed to the indexers. */
	readonly indexed: number;
	/** True when any fetch, comment fetch, or repo-level step failed. */
	readonly hadErrors: boolean;
}

/**
 * Syncs one GitHub source: fetches the configured resource types for every
 * resolved repo, indexes each resource, then purges previously indexed
 * resources that are gone from resource types that were fetched completely.
 *
 * A resource type counts as complete for a repo only when the fetch was not
 * capped by `maxItemsPerRepo`, reported no errors and (for types with
 * comments) no comment fetch failed in that repo so far.
 *
 * The sync stops early when the source is no longer present, enabled and
 * owned by `agentId` in the sources config.
 *
 * @param source The source entry to sync.
 * @param options Bridge options; `options.agentId` selects the owning agent.
 * @returns Count of indexed resources and whether anything failed.
 * @throws When the configured token reference cannot be resolved, or when
 *   repo resolution itself fails.
 */
export async function syncGitHubSource(
	source: SignetSourceEntry,
	options: GitHubSourceBridgeOptions = {},
): Promise<GitHubSourceSyncResult> {
	const agentId = options.agentId ?? resolveDaemonAgentId();
	const settings = parseGitHubSettings(source.settings);
	if (settings.repos.length === 0) {
		logger.warn("github-source", "Source has no repos — skipping. Settings may be malformed.", {
			sourceId: source.id,
			hasSettings: !!source.settings,
		});
		return { indexed: 0, hadErrors: false };
	}
	const token = settings.tokenRef ? await resolveToken(settings.tokenRef, options.agentsDir) : undefined;
	let totalIndexed = 0;
	let hadErrors = false;

	const repos = await resolveRepos(settings, token);
	logger.info("github-source", "Starting GitHub source sync", {
		sourceId: source.id,
		repoCount: repos.length,
		resourceTypes: settings.resourceTypes,
	});

	const agentsDir = options.agentsDir ?? process.env.SIGNET_PATH ?? `${homedir()}/.agents`;
	// The bridge passes `source.agentId ?? "default"` as agentId, so apply the same
	// default here; otherwise an entry without agentId would never look active.
	const isSourceActive = (): boolean =>
		loadSourcesConfig(agentsDir).sources.some(
			(s) => s.id === source.id && s.enabled && (s.agentId ?? "default") === agentId,
		);
	const syncOpts: GitHubSourceBridgeOptions = { ...options, sourceActiveCheck: isSourceActive };

	for (const repo of repos) {
		const config: GitHubFetchConfig = { owner: repo.owner, repo: repo.repo, token };
		const yielder = yieldEvery(5);
		let repoIndexed = 0;
		const seenKeys = new Set<string>();
		const completeTypes = new Set<string>();

		try {
			if (!isSourceActive()) {
				logger.info("github-source", "Source removed during sync, aborting", { sourceId: source.id });
				break;
			}
			let commentFetchFailed = false;

			if (settings.resourceTypes.includes("issues")) {
				const result = await fetchIssues(config, undefined, settings.state, settings.maxItemsPerRepo, settings.labels);
				if (!isSourceActive()) break;
				const capped = result.resources.length >= settings.maxItemsPerRepo;
				for (const resource of result.resources) {
					seenKeys.add(resourceKey(resource));
					let comments: SourceComment[] | undefined;
					if (settings.includeComments && resource.commentsCount > 0) {
						try {
							const rawComments = await fetchIssueComments(config, resource.number ?? 0);
							comments = rawComments.map((c) => ({
								author: c.user?.login ?? null,
								body: c.body,
								createdAt: c.created_at,
							}));
						} catch (err) {
							commentFetchFailed = true;
							hadErrors = true;
							logCommentFetchError(source.id, repo.fullName, resource, err);
						}
					}
					await indexResource(source.id, repo.fullName, resource, comments, agentId, syncOpts);
					repoIndexed++;
					await yielder();
				}
				logErrors(source.id, repo.fullName, "issues", result.resources.length, result.errors);
				if (result.errors.length > 0) hadErrors = true;
				if (!capped && result.errors.length === 0 && !commentFetchFailed) completeTypes.add("issues");
			}

			if (settings.resourceTypes.includes("pulls")) {
				// The label filter goes through the search-based fetch instead of the plain list fetch.
				const hasLabels = settings.labels && settings.labels.length > 0;
				const result = hasLabels
					? await fetchPullRequestsBySearch(
							config,
							settings.labels,
							undefined,
							settings.state,
							settings.maxItemsPerRepo,
						)
					: await fetchPullRequests(config, undefined, settings.state, settings.maxItemsPerRepo);
				if (!isSourceActive()) break;
				const capped = result.resources.length >= settings.maxItemsPerRepo;
				for (const resource of result.resources) {
					seenKeys.add(resourceKey(resource));
					let comments: SourceComment[] | undefined;
					if (settings.includeComments && resource.commentsCount > 0) {
						try {
							const issueComments = await fetchIssueComments(config, resource.number ?? 0);
							const reviewComments = await fetchPullRequestComments(config, resource.number ?? 0);
							comments = [...issueComments, ...reviewComments].map((c) => ({
								author: c.user?.login ?? null,
								body: c.body,
								createdAt: c.created_at,
							}));
						} catch (err) {
							commentFetchFailed = true;
							hadErrors = true;
							logCommentFetchError(source.id, repo.fullName, resource, err);
						}
					}
					await indexResource(source.id, repo.fullName, resource, comments, agentId, syncOpts);
					repoIndexed++;
					await yielder();
				}
				logErrors(source.id, repo.fullName, "pulls", result.resources.length, result.errors);
				if (result.errors.length > 0) hadErrors = true;
				if (!capped && result.errors.length === 0 && !commentFetchFailed) completeTypes.add("pulls");
			}

			if (settings.resourceTypes.includes("discussions")) {
				if (!config.token) {
					logger.warn("github-source", "Discussions require a token (GraphQL API) — skipping", {
						sourceId: source.id,
						repo: repo.fullName,
					});
					hadErrors = true;
				} else {
					const result = await fetchDiscussions(config, undefined, settings.state, settings.maxItemsPerRepo);
					if (!isSourceActive()) break;
					const capped = result.resources.length >= settings.maxItemsPerRepo;
					// Labels are filtered here, after the fetch, for discussions.
					const labelSet = settings.labels?.length ? new Set(settings.labels) : null;
					for (const resource of result.resources) {
						if (labelSet && !resource.labels.some((l) => labelSet.has(l))) continue;
						seenKeys.add(resourceKey(resource));
						let comments: SourceComment[] | undefined;
						if (settings.includeComments && resource.commentsCount > 0) {
							try {
								const rawComments = await fetchDiscussionComments(config, resource.number ?? 0);
								comments = rawComments.map((c) => ({
									author: typeof c.author === "string" ? c.author : (c.author?.login ?? null),
									body: c.body,
									createdAt: c.created_at,
								}));
							} catch (err) {
								commentFetchFailed = true;
								hadErrors = true;
								logCommentFetchError(source.id, repo.fullName, resource, err);
							}
						}
						await indexResource(source.id, repo.fullName, resource, comments, agentId, syncOpts);
						repoIndexed++;
						await yielder();
					}
					logErrors(source.id, repo.fullName, "discussions", result.resources.length, result.errors);
					if (result.errors.length > 0) hadErrors = true;
					if (!capped && result.errors.length === 0 && !commentFetchFailed) completeTypes.add("discussions");
				}
			}

			if (settings.resourceTypes.includes("docs")) {
				const docPaths = settings.docPaths ?? ["README.md", "CHANGELOG.md"];
				const result = await fetchRepoDocs(config, docPaths, repo.defaultBranch, settings.maxItemsPerRepo);
				if (!isSourceActive()) break;
				const capped = result.resources.length >= settings.maxItemsPerRepo;
				for (const resource of result.resources) {
					seenKeys.add(resourceKey(resource));
					await indexResource(source.id, repo.fullName, resource, undefined, agentId, syncOpts);
					repoIndexed++;
					await yielder();
				}
				logErrors(source.id, repo.fullName, "docs", result.resources.length, result.errors);
				if (result.errors.length > 0) hadErrors = true;
				if (!capped && result.errors.length === 0) completeTypes.add("docs");
			}

			await reconcileStaleResources(source.id, repo.fullName, seenKeys, completeTypes, agentId);
		} catch (err) {
			hadErrors = true;
			logger.warn("github-source", "Failed to sync repo", {
				sourceId: source.id,
				repo: repo.fullName,
				error: err instanceof Error ? err.message : String(err),
			});
		} finally {
			// Runs on the abort `break`s too, so resources already indexed for this
			// repo are still counted.
			totalIndexed += repoIndexed;
		}

		logger.info("github-source", "Repo sync complete", {
			sourceId: source.id,
			repo: repo.fullName,
			indexed: repoIndexed,
		});
	}
	return { indexed: totalIndexed, hadErrors };
}

/**
 * Writes one resource to the graph and, when both `embeddingConfig` and
 * `fetchEmbedding` are configured, to the embedding store.
 *
 * @throws When `options.sourceActiveCheck` reports the source as inactive.
 */
async function indexResource(
	sourceId: string,
	repo: string,
	resource: GitHubResource,
	comments: SourceComment[] | undefined,
	agentId: string,
	options: GitHubSourceBridgeOptions,
): Promise<void> {
	if (options.sourceActiveCheck && !options.sourceActiveCheck()) {
		throw new Error(`Source ${sourceId} removed during sync`);
	}
	indexGitHubSourceStructure({
		agentId,
		sourceId,
		sourceName: repo,
		repo,
		resource,
	});

	if (!options.embeddingConfig || !options.fetchEmbedding) return;
	await indexGitHubSourceEmbeddings({
		agentId,
		sourceId,
		repo,
		resource,
		comments,
		embeddingConfig: options.embeddingConfig,
		fetchEmbedding: options.fetchEmbedding,
		sourceActiveCheck: options.sourceActiveCheck,
	});
}

/**
 * Looks up the secret named by `tokenRef`.
 *
 * @throws An error naming the token ref when `getSecret` rejects.
 */
async function resolveToken(tokenRef: string, _agentsDir?: string): Promise<string> {
	try {
		return await getSecret(tokenRef);
	} catch (err) {
		throw new Error(`Failed to resolve token ref '${tokenRef}': ${err instanceof Error ? err.message : String(err)}`);
	}
}
/** Logs one warning summarising fetch errors for a resource type; silent when there are none. */
function logErrors(
	sourceId: string,
	repo: string,
	type: string,
	count: number,
	errors: readonly { message: string }[],
): void {
	if (errors.length === 0) return;
	logger.warn("github-source", `Errors during ${type} fetch`, {
		sourceId,
		repo,
		type,
		fetched: count,
		errors: errors.length,
	});
}

/** Logs a failed comment fetch for a single resource. */
function logCommentFetchError(sourceId: string, repo: string, resource: GitHubResource, err: unknown): void {
	logger.warn("github-source", "Comment fetch failed", {
		sourceId,
		repo,
		type: resource.type,
		number: resource.number,
		path: resource.path,
		error: err instanceof Error ? err.message : String(err),
	});
}

/**
 * Repo-local key of a resource: `docs:<path>` for docs that have a path,
 * `<type>:<number>` for everything else.
 */
function resourceKey(resource: GitHubResource): string {
	if (resource.type === "doc" && resource.path) return `docs:${resource.path}`;
	return `${resource.type}:${resource.number}`;
}

// A Map, not an object literal, so names like "constructor" fall through unchanged.
const RESOURCE_TYPE_PLURALS: ReadonlyMap<string, string> = new Map([
	["issue", "issues"],
	["pull", "pulls"],
	["discussion", "discussions"],
	["doc", "docs"],
	["docs", "docs"],
]);

/** Maps a singular resource type to its settings name; unknown types are returned as-is. */
function resourceTypePlural(type: string): string {
	return RESOURCE_TYPE_PLURALS.get(type) ?? type;
}

/**
 * Splits a stored `source_path` into its repo-local key and leading type segment.
 *
 * An optional `github:` prefix is dropped first. Returns `null` when the
 * remainder does not start with `<repo>:`.
 */
function sourcePathToLocalKey(sourcePath: string, repo: string): { localKey: string; rawType: string } | null {
	const scheme = "github:";
	const key = sourcePath.startsWith(scheme) ? sourcePath.slice(scheme.length) : sourcePath;
	const repoPrefix = `${repo}:`;
	if (!key.startsWith(repoPrefix)) return null;
	const localKey = key.slice(repoPrefix.length);
	const [rawType = ""] = localKey.split(":");
	return { localKey, rawType };
}

/**
 * Purges indexed resources of `repo` that the current sync did not see.
 *
 * Only rows whose resource type is in `completeTypes` are considered, so a
 * capped or failed fetch never causes deletions. Does nothing when no type
 * completed.
 *
 * @param seenKeys Repo-local keys (see `resourceKey`) seen during this sync.
 * @param completeTypes Plural resource types that were fetched completely.
 */
async function reconcileStaleResources(
	sourceId: string,
	repo: string,
	seenKeys: ReadonlySet<string>,
	completeTypes: ReadonlySet<string>,
	agentId: string,
): Promise<void> {
	if (completeTypes.size === 0) return;
	const { purgeGitHubResourceEmbeddings } = await import("./github-source-embeddings");
	const { purgeGitHubResourceStructure } = await import("./github-source-graph");
	const db = getDbAccessor();
	const rows = db.withReadDb((d) =>
		d
			.prepare(
				"SELECT source_path FROM entities WHERE source_id = ? AND agent_id = ? AND entity_type = 'source_document'",
			)
			.all(sourceId, agentId),
	) as Array<{ source_path: string }>;

	let purged = 0;
	for (const row of rows) {
		const parsed = sourcePathToLocalKey(row.source_path, repo);
		if (!parsed) continue;
		const { localKey, rawType } = parsed;
		if (seenKeys.has(localKey)) continue;
		if (!completeTypes.has(resourceTypePlural(rawType))) continue;

		// Doc keys are stored as `docs:<path>`; every other key is `<type>:<number>`.
		const isDoc = rawType === "docs";
		const numOrPath = localKey.slice(rawType.length + 1);
		// Placeholder resource: only type and number/path carry real values.
		const resource: GitHubResource = {
			type: (isDoc ? "doc" : rawType) as GitHubResource["type"],
			number: isDoc ? undefined : Number(numOrPath) || 0,
			path: isDoc ? numOrPath : undefined,
			title: "",
			body: "",
			state: "",
			labels: [],
			author: null,
			createdAt: "",
			updatedAt: "",
			closedAt: null,
			mergedAt: null,
			commentsCount: 0,
			extra: {},
		};
		purgeGitHubResourceEmbeddings({ sourceId, repo, agentId, resource });
		purgeGitHubResourceStructure({ sourceId, repo, agentId, resource });
		purged++;
	}
	if (purged > 0) {
		logger.info("github-source", "Reconciled stale resources", { sourceId, repo, purged });
	}
}
/**
 * Starts the GitHub source bridge.
 *
 * `sync()` walks the enabled `github` sources one after another and syncs each
 * under its own `agentId` (`"default"` when the entry has none). Concurrent
 * calls share the in-flight pass instead of starting a second one. A source is
 * marked indexed and its job completed only when its sync reports no errors;
 * otherwise the job is failed.
 *
 * @param sourcesOrLoader Fixed source list, or a loader called at the start of every pass.
 * @param options Bridge options. `pollIntervalMs` defaults to 300_000; a value
 *   of 0 or less disables the polling timer.
 * @returns Handle exposing `sync` and `close`.
 */
export function startGitHubSourceBridge(
	sourcesOrLoader: readonly SignetSourceEntry[] | (() => readonly SignetSourceEntry[]),
	options: GitHubSourceBridgeOptions = {},
): GitHubSourceBridgeHandle {
	const loadSources = typeof sourcesOrLoader === "function" ? sourcesOrLoader : () => sourcesOrLoader;
	let syncInFlight: Promise<number> | null = null;

	/** Syncs one source with job bookkeeping; resolves to the number of resources indexed. */
	const syncSource = async (source: SignetSourceEntry): Promise<number> => {
		const sourceAgentId = source.agentId ?? "default";
		if (isSourceIndexInFlight(source.id)) return 0;
		const job = beginSourceIndexJob(source.id, "github-source-sync");
		markSourceIndexInFlight(source.id);
		if (!markSourceIndexJobRunning(source.id, job.id)) {
			clearSourceIndexInFlight(source.id);
			return 0;
		}
		try {
			const result = await syncGitHubSource(source, { ...options, agentId: sourceAgentId });
			if (result.hadErrors) {
				failSourceIndexJob(source.id, job.id, "GitHub source sync completed with partial errors");
			} else {
				markSourceIndexed(source.id, undefined, options.agentsDir);
				completeSourceIndexJob(source.id, job.id, result.indexed);
			}
			return result.indexed;
		} catch (err) {
			failSourceIndexJob(source.id, job.id, err);
			logger.warn("github-source", "Source sync failed", {
				sourceId: source.id,
				error: err instanceof Error ? err.message : String(err),
			});
			return 0;
		} finally {
			clearSourceIndexInFlight(source.id);
		}
	};

	const runPass = async (): Promise<number> => {
		let total = 0;
		for (const source of loadSources()) {
			if (!source.enabled || source.kind !== "github") continue;
			total += await syncSource(source);
		}
		return total;
	};

	const sync = async (): Promise<number> => {
		if (syncInFlight) return syncInFlight;
		syncInFlight = runPass().finally(() => {
			syncInFlight = null;
		});
		return syncInFlight;
	};

	const pollIntervalMs = options.pollIntervalMs ?? 300_000;
	const pollTimer =
		pollIntervalMs > 0
			? setInterval(() => {
					sync().catch((err) => {
						logger.warn("github-source", "Polling sync failed", {
							error: err instanceof Error ? err.message : String(err),
						});
					});
				}, pollIntervalMs)
			: null;
	// Do not let the polling timer keep the process alive.
	pollTimer?.unref?.();

	return {
		sync,
		async close(): Promise<void> {
			if (pollTimer) clearInterval(pollTimer);
			if (syncInFlight) await syncInFlight.catch(() => 0);
		},
	};
}

/**
 * Removes all embeddings and graph structure stored for a GitHub source.
 *
 * @param agentId Owning agent; falls back to `resolveDaemonAgentId()`.
 * @returns Sum of the values returned by the embeddings purge and the structure purge.
 */
export function purgeGitHubSource(sourceId: string, agentId?: string): number {
	const id = agentId ?? resolveDaemonAgentId();
	const embeddings = purgeGitHubSourceEmbeddings({ sourceId, agentId: id });
	const structure = purgeGitHubSourceStructure({ sourceId, agentId: id });
	return embeddings + structure;
}
= makeIssue(); + const chunks = buildGitHubSourceChunks({ + sourceId: "github:abc123", + repo: "Signet-AI/signetai", + resource, + }); + expect(chunks.length).toBeGreaterThan(0); + expect(chunks[0]?.id).toContain("github:abc123"); + expect(chunks[0]?.id).toContain("Signet-AI/signetai"); + expect(chunks[0]?.id).toContain("issue:42"); + expect(chunks[0]?.chunkText).toContain("source_id: github:abc123"); + expect(chunks[0]?.chunkText).toContain("repo: Signet-AI/signetai"); + }); + + test("produces chunks from a PR with comments", () => { + const resource = makeIssue({ type: "pull", number: 123, title: "Add GitHub source connector" }); + const comments = [ + { author: "alexmondello", body: "Looks good, just one nit.", createdAt: "2026-05-12T00:00:00Z" }, + { author: "nicholai", body: "Fixed, pushing now.", createdAt: "2026-05-12T01:00:00Z" }, + ]; + const chunks = buildGitHubSourceChunks({ + sourceId: "github:abc123", + repo: "Signet-AI/signetai", + resource, + comments, + }); + expect(chunks.length).toBeGreaterThan(0); + const allText = chunks.map((c) => c.chunkText).join(" "); + expect(allText).toContain("alexmondello"); + expect(allText).toContain("nicholai"); + }); + + test("produces chunks from a doc", () => { + const resource: GitHubResource = { + type: "doc", + path: "README.md", + title: "README", + body: "# Signet AI\n\nThis is the signet project. 
It does stuff.\n\n## Installation\n\nRun `bun add -g signetai`.", + state: "open", + labels: [], + author: null, + createdAt: "", + updatedAt: "", + closedAt: null, + mergedAt: null, + commentsCount: 0, + extra: { path: "README.md" }, + }; + const chunks = buildGitHubSourceChunks({ + sourceId: "github:abc123", + repo: "Signet-AI/signetai", + resource, + }); + expect(chunks.length).toBeGreaterThan(0); + expect(chunks[0]?.id).toContain("docs:README.md"); + }); + + test("returns minimal chunk for empty body", () => { + const resource = makeIssue({ body: "" }); + const chunks = buildGitHubSourceChunks({ + sourceId: "github:abc123", + repo: "Signet-AI/signetai", + resource, + }); + expect(chunks.length).toBe(1); + expect(chunks[0]?.chunkText).toContain("source_id: github:abc123"); + }); + + test("splits long content into multiple chunks", () => { + const longBody = "x".repeat(5000); + const resource = makeIssue({ body: longBody }); + const chunks = buildGitHubSourceChunks({ + sourceId: "github:abc123", + repo: "Signet-AI/signetai", + resource, + }); + expect(chunks.length).toBeGreaterThan(1); + }); + + test("aborts before writing embeddings when the source becomes inactive", async () => { + const dir = mkdtempSync(join(tmpdir(), "signet-github-embeddings-")); + initDbAccessor(join(dir, "memories.db")); + let active = true; + + await expect( + indexGitHubSourceEmbeddings({ + agentId: "default", + sourceId: "github:abc123", + repo: "Signet-AI/signetai", + resource: makeIssue(), + embeddingConfig: { provider: "openai", model: "text-embedding-3-small", dimensions: 3 }, + fetchEmbedding: async () => { + active = false; + return [0.1, 0.2, 0.3]; + }, + sourceActiveCheck: () => active, + }), + ).rejects.toThrow("removed during embedding sync"); + + const count = getDbAccessor().withReadDb( + (db) => (db.prepare("SELECT COUNT(*) AS count FROM embeddings").get() as { count: number }).count, + ); + expect(count).toBe(0); + }); +}); diff --git 
a/platform/daemon/src/github-source-embeddings.ts b/platform/daemon/src/github-source-embeddings.ts new file mode 100644 index 000000000..67a1c8641 --- /dev/null +++ b/platform/daemon/src/github-source-embeddings.ts @@ -0,0 +1,341 @@ +import { createHash } from "node:crypto"; +import { yieldEvery } from "./async-yield"; +import { getDbAccessor } from "./db-accessor"; +import { syncVecDeleteByEmbeddingIds, syncVecInsert, vectorToBlob } from "./db-helpers"; +import type { GitHubResource } from "./github-source-fetch"; +import { resourceToMarkdown } from "./github-source-fetch"; +import type { EmbeddingConfig } from "./memory-config"; +import type { SourceEmbeddingFetch } from "./obsidian-source-embeddings"; + +export const GITHUB_CHUNK_SOURCE_TYPE = "source_github_chunk"; +const GITHUB_SOURCE_CHUNK_DELAY_MS = 100; + +export interface GitHubSourceChunk { + readonly id: string; + readonly text: string; + readonly chunkText: string; + readonly heading: string; + readonly startLine: number; + readonly endLine: number; +} + +export interface IndexGitHubSourceEmbeddingsInput { + readonly agentId: string; + readonly sourceId: string; + readonly repo: string; + readonly resource: GitHubResource; + readonly comments?: readonly { author: string | null; body: string; createdAt: string }[]; + readonly embeddingConfig: EmbeddingConfig; + readonly fetchEmbedding: SourceEmbeddingFetch; + readonly sourceActiveCheck?: () => boolean; +} + +export interface IndexGitHubSourceEmbeddingsResult { + readonly chunks: number; + readonly embedded: number; + readonly skipped: number; +} + +const TARGET_CHARS = 1_600; +const MAX_CHARS = 2_200; +const MIN_CHARS = 40; + +function hash(input: string): string { + return createHash("sha256").update(input).digest("hex"); +} + +function resourceId(sourceId: string, repo: string, resource: GitHubResource): string { + if (resource.type === "doc" && resource.path) { + return `${sourceId}:${repo}:docs:${resource.path}`; + } + return 
`${sourceId}:${repo}:${resource.type}:${resource.number}`; +} + +export function buildGitHubSourceChunks(input: { + readonly sourceId: string; + readonly repo: string; + readonly resource: GitHubResource; + readonly comments?: readonly { author: string | null; body: string; createdAt: string }[]; +}): GitHubSourceChunk[] { + const markdown = resourceToMarkdown(input.resource, input.comments); + const prefix = resourceId(input.sourceId, input.repo, input.resource); + const sections = parseMarkdownSections(markdown); + const chunks: GitHubSourceChunk[] = []; + + for (const section of sections) { + const paragraphs = splitParagraphs(section.body); + let bucket = ""; + let chunkIndex = 0; + const flush = (): void => { + const trimmed = bucket.trim(); + if (trimmed.length < MIN_CHARS) { + bucket = ""; + return; + } + for (const piece of splitLongText(trimmed)) { + const headingKey = slug(section.heading) || "overview"; + const lineKey = `${section.startLine}-${section.endLine}`; + const chunkId = `${prefix}#${headingKey}:${lineKey}:${chunkIndex}`; + const chunkText = [ + `source_id: ${input.sourceId}`, + `repo: ${input.repo}`, + `type: ${input.resource.type}`, + input.resource.number != null ? `number: ${input.resource.number}` : `path: ${input.resource.path}`, + `heading: ${section.heading}`, + "", + piece, + ].join("\n"); + chunks.push({ + id: chunkId, + text: piece, + chunkText, + heading: section.heading, + startLine: section.startLine, + endLine: section.endLine, + }); + chunkIndex++; + } + bucket = ""; + }; + for (const paragraph of paragraphs) { + if (paragraph.length > MAX_CHARS) { + flush(); + for (const piece of splitLongText(paragraph)) { + bucket = piece; + flush(); + } + continue; + } + const candidate = bucket ? 
/** One heading-delimited slice of a Markdown document. */
interface MarkdownSection {
	/** Heading text, or "Overview" for content that precedes the first heading. */
	readonly heading: string;
	/** 1-based line number of the heading (1 for the leading "Overview" section). */
	readonly startLine: number;
	/** `startLine` plus the number of body lines collected for the section. */
	readonly endLine: number;
	/** Trimmed body text of the section. */
	readonly body: string;
}

/**
 * Splits Markdown into sections at ATX headings (`#` through `######`).
 *
 * Content before the first heading is collected under the heading "Overview".
 * Sections whose trimmed body is shorter than `MIN_CHARS` are dropped.
 * Heading-like lines inside fenced code blocks (``` or ~~~) are treated as
 * body text, so a shell or Python comment in a code sample does not start a
 * new section.
 *
 * @param content - Markdown text; CRLF and lone CR line endings are normalised.
 * @returns The retained sections in document order.
 */
function parseMarkdownSections(content: string): MarkdownSection[] {
	const headingPattern = /^(#{1,6})\s+(.+?)\s*$/;
	const fencePattern = /^ {0,3}(`{3,}|~{3,})(.*)$/;
	const lines = content.replace(/\r\n?/g, "\n").split("\n");
	const sections: MarkdownSection[] = [];

	let heading = "Overview";
	let startLine = 1;
	let bodyLines: string[] = [];
	// Marker of the currently open code fence, or null when outside a fence.
	let openFence: { readonly char: string; readonly length: number } | null = null;

	const closeSection = (): void => {
		const body = bodyLines.join("\n").trim();
		if (body.length >= MIN_CHARS) {
			sections.push({ heading, startLine, endLine: startLine + bodyLines.length, body });
		}
	};

	for (let idx = 0; idx < lines.length; idx++) {
		const line = lines[idx] ?? "";

		const fenceMatch = fencePattern.exec(line);
		if (fenceMatch) {
			const marker = fenceMatch[1] ?? "";
			const rest = fenceMatch[2] ?? "";
			const markerChar = marker.charAt(0);
			if (openFence === null) {
				// A backtick fence may not contain backticks in its info string.
				if (!(markerChar === "`" && rest.includes("`"))) {
					openFence = { char: markerChar, length: marker.length };
				}
			} else if (markerChar === openFence.char && marker.length >= openFence.length && rest.trim() === "") {
				openFence = null;
			}
			bodyLines.push(line);
			continue;
		}

		const headingMatch = openFence === null ? headingPattern.exec(line) : null;
		if (headingMatch) {
			closeSection();
			heading = headingMatch[2]?.trim() || "Untitled";
			startLine = idx + 1;
			bodyLines = [];
			continue;
		}
		bodyLines.push(line);
	}
	closeSection();
	return sections;
}
/**
 * Converts arbitrary text into a lowercase, dash-separated key.
 *
 * Runs of characters outside `a-z0-9` become a single dash, leading and
 * trailing dashes are removed, and the result is cut to 80 characters.
 *
 * @param input - Text to convert (for example a section heading).
 * @returns The slug; empty when the input has no ASCII letters or digits.
 */
function slug(input: string): string {
	const lowered = input.toLowerCase();
	const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
	const trimmed = dashed.replace(/^-+|-+$/g, "");
	return trimmed.slice(0, 80);
}
syncVecDeleteByEmbeddingIds(db, [embId]); + db.prepare("DELETE FROM embeddings WHERE id = ?").run(embId); + } + db.prepare( + `INSERT INTO embeddings + (id, content_hash, vector, dimensions, source_type, source_id, chunk_text, created_at, agent_id) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(content_hash) DO UPDATE SET + vector = excluded.vector, + dimensions = excluded.dimensions, + source_type = excluded.source_type, + source_id = excluded.source_id, + chunk_text = excluded.chunk_text, + created_at = excluded.created_at, + agent_id = excluded.agent_id`, + ).run( + embId, + contentHash, + vectorToBlob(vector), + vector.length, + GITHUB_CHUNK_SOURCE_TYPE, + chunk.id, + chunk.chunkText, + now, + input.agentId, + ); + const stored = db.prepare("SELECT id FROM embeddings WHERE content_hash = ?").get(contentHash) as + | { id: string } + | undefined; + syncVecInsert(db, stored?.id ?? embId, vector); + }); + embedded++; + await yielder(); + await sleep(GITHUB_SOURCE_CHUNK_DELAY_MS); + } + + const prefix = `${resourceId(input.sourceId, input.repo, input.resource)}#`; + assertSourceActive(input); + getDbAccessor().withWriteTx((db) => { + const stale = db + .prepare( + "SELECT id, content_hash FROM embeddings WHERE source_type = ? AND source_id >= ? AND source_id < ? 
/**
 * Looks up the content hash stored for one GitHub chunk embedding.
 *
 * @param agentId - Agent that owns the embedding row.
 * @param chunkId - Chunk identifier stored in the `source_id` column.
 * @returns The stored `content_hash`, or `null` when no matching row exists.
 */
function existingChunkEmbeddingContentHash(agentId: string, chunkId: string): string | null {
	const lookupSql =
		"SELECT content_hash FROM embeddings WHERE source_type = ? AND source_id = ? AND agent_id = ? LIMIT 1";
	const found = getDbAccessor().withReadDb((db) => {
		const statement = db.prepare(lookupSql);
		return statement.get(GITHUB_CHUNK_SOURCE_TYPE, chunkId, agentId);
	}) as { content_hash: string } | undefined;
	if (found == null) return null;
	return found.content_hash ?? null;
}
/**
 * Deletes every chunk embedding that belongs to one GitHub resource.
 *
 * Rows are selected by a `source_id` range that starts at the resource
 * identifier followed by `#` and ends at that prefix plus `\uffff`. Their
 * vector entries are removed first, then the embedding rows, all inside one
 * write transaction.
 *
 * @param input - Source, repo, agent and resource whose chunks are purged.
 * @returns The `changes` count reported by the DELETE statement.
 */
export function purgeGitHubResourceEmbeddings(input: {
	readonly sourceId: string;
	readonly repo: string;
	readonly agentId: string;
	readonly resource: GitHubResource;
}): number {
	const { sourceId, repo, agentId, resource } = input;
	const lowerBound = `${resourceId(sourceId, repo, resource)}#`;
	const upperBound = `${lowerBound}\uffff`;

	return getDbAccessor().withWriteTx((db) => {
		const selectStmt = db.prepare(
			"SELECT id FROM embeddings WHERE source_type = ? AND source_id >= ? AND source_id < ? AND agent_id = ?",
		);
		const matches = selectStmt.all(GITHUB_CHUNK_SOURCE_TYPE, lowerBound, upperBound, agentId) as Array<{
			id: string;
		}>;
		const embeddingIds: string[] = [];
		for (const match of matches) embeddingIds.push(match.id);
		syncVecDeleteByEmbeddingIds(db, embeddingIds);

		const deleteStmt = db.prepare(
			"DELETE FROM embeddings WHERE source_type = ? AND source_id >= ? AND source_id < ? AND agent_id = ?",
		);
		const outcome = deleteStmt.run(GITHUB_CHUNK_SOURCE_TYPE, lowerBound, upperBound, agentId);
		return outcome.changes;
	});
}
JSON.stringify({ + name, + sha: `sha-${name}`, + content: Buffer.from(`# ${name}`).toString("base64"), + }), + { status: 200 }, + ); + }) as typeof fetch; + + const result = await fetchRepoDocs( + { owner: "Signet-AI", repo: "signetai" }, + ["docs/**/*.md"], + "main", + 1, + ); + + expect(result.resources).toHaveLength(1); + expect(result.resources[0]?.path).toBe("docs/a.md"); + }); +}); + +describe("expandRepoGlob", () => { + test("caps wildcard repo expansion to the configured limit", async () => { + let requests = 0; + globalThis.fetch = mock(async (input: string | URL | Request) => { + requests++; + const url = new URL(String(input)); + expect(url.searchParams.get("per_page")).toBe("1"); + expect(url.searchParams.get("page")).toBe("1"); + return new Response(JSON.stringify([{ full_name: "Signet-AI/signetai", name: "signetai" }]), { + status: 200, + }); + }) as typeof fetch; + + const result = await expandRepoGlob("Signet-AI", "*", undefined, 1); + + expect(result.repos).toEqual(["Signet-AI/signetai"]); + expect(result.truncated).toBe(true); + expect(requests).toBe(1); + }); + + test("escapes regex metacharacters in repo globs before matching names", async () => { + globalThis.fetch = mock(async () => { + return new Response( + JSON.stringify([ + { full_name: "Signet-AI/private.repo", name: "private.repo" }, + { full_name: "Signet-AI/privateXrepo", name: "privateXrepo" }, + ]), + { status: 200 }, + ); + }) as typeof fetch; + + const result = await expandRepoGlob("Signet-AI", "private.*", undefined, 10); + + expect(result.repos).toEqual(["Signet-AI/private.repo"]); + }); +}); + +describe("fetchIssues", () => { + test("keeps fetching until it indexes the requested number of issues even when pages include PR rows", async () => { + let requests = 0; + globalThis.fetch = (async (input) => { + requests++; + const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url; + const page = Number(new URL(url).searchParams.get("page") ?? 
"1"); + const perPage = Number(new URL(url).searchParams.get("per_page") ?? "100"); + const rows = + page === 1 + ? [ + makeIssueRow(1, false), + ...Array.from({ length: 99 }, (_, index) => makeIssueRow(index + 2, true)), + ] + : [makeIssueRow(5, false)]; + return new Response( + JSON.stringify(rows.slice(0, perPage)), + { + status: 200, + headers: { + "content-type": "application/json", + "x-ratelimit-remaining": "4999", + "x-ratelimit-reset": "0", + }, + }, + ); + }) as typeof fetch; + + const result = await fetchIssues({ owner: "Signet-AI", repo: "signetai" }, undefined, "all", 2); + + expect(requests).toBe(2); + expect(result.resources.map((resource) => resource.number)).toEqual([1, 5]); + }); + + test("reports a bounded partial result when PR rows exhaust the scan budget", async () => { + let requests = 0; + globalThis.fetch = (async () => { + requests++; + return new Response(JSON.stringify(Array.from({ length: 100 }, (_, index) => makeIssueRow(index + 1, true))), { + status: 200, + headers: { + "content-type": "application/json", + "x-ratelimit-remaining": "4999", + "x-ratelimit-reset": "0", + }, + }); + }) as typeof fetch; + + const result = await fetchIssues({ owner: "Signet-AI", repo: "signetai" }, undefined, "all", 2); + + expect(requests).toBe(5); + expect(result.resources).toHaveLength(0); + expect(result.errors[0]?.message).toContain("scan budget exhausted"); + }); +}); + +describe("fetchDiscussions", () => { + test("filters discussions by requested state across pages", async () => { + let requests = 0; + globalThis.fetch = (async (_input, init) => { + requests++; + const body = JSON.parse(String(init?.body ?? "{}")) as { variables?: { after?: string | null } }; + const after = body.variables?.after ?? 
null; + return new Response(JSON.stringify(makeDiscussionPage(after)), { + status: 200, + headers: { + "content-type": "application/json", + "x-ratelimit-remaining": "4999", + "x-ratelimit-reset": "0", + }, + }); + }) as typeof fetch; + + const result = await fetchDiscussions({ owner: "Signet-AI", repo: "signetai" }, undefined, "closed", 2); + + expect(requests).toBe(2); + expect(result.resources).toHaveLength(2); + expect(result.resources.map((resource) => resource.number)).toEqual([2, 3]); + expect(result.resources.map((resource) => resource.state)).toEqual(["closed", "closed"]); + }); +}); + +describe("fetchPullRequestsBySearch", () => { + test("escapes label values before building the GitHub search query", async () => { + let capturedQuery = ""; + globalThis.fetch = (async (input) => { + const url = new URL(typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url); + capturedQuery = url.searchParams.get("q") ?? ""; + return new Response( + JSON.stringify({ + items: [], + total_count: 0, + }), + { + status: 200, + headers: { + "content-type": "application/json", + "x-ratelimit-remaining": "29", + "x-ratelimit-reset": "0", + }, + }, + ); + }) as typeof fetch; + + const result = await fetchPullRequestsBySearch( + { owner: "Signet-AI", repo: "signetai" }, + ['needs "quote" \\ slash'], + ); + + expect(result.resources).toEqual([]); + expect(capturedQuery).toContain('label:"needs \\"quote\\" \\\\ slash"'); + }); +}); + +function makeIssueRow(number: number, isPullRequest: boolean): Record { + return { + number, + title: `Issue ${number}`, + body: "", + state: "open", + html_url: `https://github.com/Signet-AI/signetai/issues/${number}`, + user: { login: "alexmondello" }, + labels: [], + assignees: [], + milestone: null, + created_at: "2026-05-23T00:00:00Z", + updated_at: "2026-05-23T00:00:00Z", + closed_at: null, + comments: 0, + ...(isPullRequest ? 
/**
 * Test fixture: builds one discussion node shaped like the GraphQL response
 * the discussions test feeds to `fetchDiscussions`.
 *
 * @param number - Discussion number, also used in the title and URL.
 * @param state - "CLOSED" marks the node as closed; anything else is open.
 * @param updatedAt - Timestamp used for both `createdAt` and `updatedAt`.
 */
function makeDiscussionNode(number: number, state: string, updatedAt: string): Record<string, unknown> {
	const isClosed = state === "CLOSED";
	const title = `Discussion ${number}`;
	const url = `https://github.com/Signet-AI/signetai/discussions/${number}`;
	const node: Record<string, unknown> = {
		number,
		title,
		body: "",
		closed: isClosed,
		url,
		author: { login: "alexmondello" },
		labels: { nodes: [] },
		createdAt: updatedAt,
		updatedAt,
		answerId: null,
		comments: { totalCount: 0 },
	};
	return node;
}
readonly pull_request?: { readonly url: string }; + readonly comments: number; +} + +export interface GitHubComment { + readonly id: number; + readonly body: string; + readonly user: { readonly login: string } | null; + readonly created_at: string; + readonly updated_at: string; +} + +export interface GitHubPullRequest { + readonly number: number; + readonly title: string; + readonly body: string | null; + readonly state: string; + readonly html_url: string; + readonly user: { readonly login: string } | null; + readonly labels: readonly { readonly name: string; readonly color: string }[]; + readonly assignees: readonly { readonly login: string }[]; + readonly milestone: { readonly title: string } | null; + readonly created_at: string; + readonly updated_at: string; + readonly closed_at: string | null; + readonly merged_at: string | null; + readonly draft: boolean; + readonly base: { readonly ref: string }; + readonly head: { readonly ref: string }; + readonly comments: number; + readonly review_comments: number; + readonly commits: number; + readonly changed_files: number; +} + +export interface GitHubSearchIssue { + readonly number: number; + readonly title: string; + readonly body: string | null; + readonly state: string; + readonly html_url: string; + readonly user: { readonly login: string } | null; + readonly labels: readonly ({ readonly name: string } | string)[]; + readonly created_at: string; + readonly updated_at: string; + readonly closed_at: string | null; + readonly comments: number; +} + +export interface GitHubDiscussion { + readonly number: number; + readonly title: string; + readonly body: string; + readonly state: string; + readonly url: string; + readonly author: { readonly login: string } | null; + readonly labels: readonly { readonly name: string }[]; + readonly created_at: string; + readonly updated_at: string; + readonly answer_id: number | null; + readonly comments_count: number; +} + +export interface GitHubDiscussionComment { + readonly id: 
number; + readonly body: string; + readonly author: { readonly login: string } | null; + readonly created_at: string; + readonly updated_at: string; + readonly is_answer: boolean; +} + +export interface GitHubRepoDoc { + readonly path: string; + readonly content: string; + readonly sha: string; +} + +export interface GitHubResource { + readonly type: "issue" | "pull" | "discussion" | "doc"; + readonly number?: number; + readonly path?: string; + readonly title: string; + readonly body: string; + readonly state: string; + readonly labels: readonly string[]; + readonly author: string | null; + readonly createdAt: string; + readonly updatedAt: string; + readonly closedAt: string | null; + readonly mergedAt: string | null; + readonly commentsCount: number; + readonly extra: Readonly>; +} + +export interface GitHubFetchResult { + readonly resources: readonly GitHubResource[]; + readonly rateLimitRemaining: number; + readonly rateLimitReset: number; + readonly errors: readonly { readonly message: string; readonly retryable: boolean }[]; +} + +export interface GitHubRepoInfo { + readonly owner: string; + readonly repo: string; + readonly fullName: string; + readonly description: string | null; + readonly defaultBranch: string; + readonly htmlUrl: string; +} + +const GITHUB_API_BASE = "https://api.github.com"; +const GRAPHQL_URL = "https://api.github.com/graphql"; +const PER_PAGE = 100; +const MAX_COMMENTS_PER_RESOURCE = 200; +const REQUEST_TIMEOUT_MS = 30_000; +const MAX_RETRIES = 3; +const RETRY_BASE_DELAY_MS = 1000; +const MAX_ISSUE_SCAN_MULTIPLIER = 5; +const MAX_ISSUE_SCAN_FLOOR = PER_PAGE * 5; +const MAX_ISSUE_SCAN_CEILING = PER_PAGE * 20; + +interface GitHubApiResponse { + readonly status: number; + readonly headers: Headers; + readonly body: unknown; +} + +interface RateLimitInfo { + readonly remaining: number; + readonly reset: number; +} + +function parseRateLimit(headers: Headers): RateLimitInfo { + return { + remaining: 
/**
 * Sends one request to the GitHub API with a timeout, retries and
 * rate-limit handling.
 *
 * - Each attempt is aborted after `REQUEST_TIMEOUT_MS`; the timer covers the
 *   body read and is always cleared, including when `fetch` rejects.
 * - A 403 with no remaining quota waits for the reset (capped at 60s) and
 *   retries.
 * - Responses with status >= 500 and thrown errors are retried with a linearly
 *   growing delay. No delay is spent after the final attempt.
 * - When fewer than 10 requests remain on an otherwise usable response, the
 *   call backs off (capped at 60s) before returning.
 *
 * @param url - Absolute request URL.
 * @param token - Optional token sent as a Bearer credential.
 * @param method - HTTP method, "GET" by default.
 * @param body - Optional payload, serialised as JSON.
 * @returns Status, headers and parsed JSON body. The body is `null` for 204
 *   and for non-2xx responses whose body is not valid JSON, so callers can
 *   still inspect the status.
 * @throws The last error seen once all `MAX_RETRIES` attempts have failed.
 */
async function githubRequest(url: string, token?: string, method = "GET", body?: unknown): Promise<GitHubApiResponse> {
	const headers: Record<string, string> = {
		Accept: "application/vnd.github.v3+json",
		"User-Agent": "signet-daemon",
	};
	if (token) headers.Authorization = `Bearer ${token}`;
	if (body) headers["Content-Type"] = "application/json";

	const pause = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, Math.max(0, ms)));

	// Performs one attempt; the abort timer is cleared on every exit path.
	const send = async (): Promise<{ response: Response; rawBody: string }> => {
		const controller = new AbortController();
		const timeout = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
		try {
			const response = await fetch(url, {
				method,
				headers,
				body: body ? JSON.stringify(body) : undefined,
				signal: controller.signal,
			});
			const rawBody = response.status === 204 ? "" : await response.text();
			return { response, rawBody };
		} finally {
			clearTimeout(timeout);
		}
	};

	let lastError: Error | null = null;
	for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
		const isLastAttempt = attempt === MAX_RETRIES - 1;
		const retryDelayMs = RETRY_BASE_DELAY_MS * (attempt + 1);
		try {
			const { response, rawBody } = await send();
			const rateLimit = parseRateLimit(response.headers);

			if (response.status === 403 && rateLimit.remaining === 0) {
				const waitMs = rateLimit.reset - Date.now() + 1000;
				logger.warn("github-source", "Rate limit exhausted, waiting", { waitMs });
				lastError = new Error(`GitHub API rate limit exhausted (403) for ${url}`);
				if (!isLastAttempt) await pause(Math.min(waitMs, 60_000));
				continue;
			}
			if (response.status >= 500) {
				lastError = new Error(`GitHub API ${response.status}: ${rawBody}`);
				if (!isLastAttempt) await pause(retryDelayMs);
				continue;
			}
			if (rateLimit.remaining < 10 && rateLimit.reset > Date.now()) {
				const waitMs = rateLimit.reset - Date.now() + 1000;
				logger.warn("github-source", "Approaching rate limit, backing off", {
					remaining: rateLimit.remaining,
					waitMs,
				});
				await pause(Math.min(waitMs, 60_000));
			}

			let parsed: unknown = null;
			if (response.status !== 204) {
				try {
					parsed = JSON.parse(rawBody);
				} catch (parseErr) {
					// A malformed success payload is retried; error responses keep a null body.
					if (response.ok) throw parseErr;
				}
			}
			return { status: response.status, headers: response.headers, body: parsed };
		} catch (err) {
			lastError = err instanceof Error ? err : new Error(String(err));
			if (!isLastAttempt) await pause(retryDelayMs);
		}
	}
	throw lastError ?? new Error("GitHub API request failed after retries");
}
/** Result of expanding a repository glob for one owner. */
export interface RepoGlobExpansion {
	/** Matching repositories as `owner/name`. */
	readonly repos: readonly string[];
	/** True when the listing may contain repositories beyond the scan limit. */
	readonly truncated: boolean;
}

/**
 * Expands a repository name pattern into concrete `owner/name` entries.
 *
 * A pattern without `*` is returned as a single repository without any
 * request. Otherwise the owner's repositories are listed, first through the
 * organisation endpoint and then through the user endpoint, and the names are
 * matched against the pattern. At most `maxRepos` listed repositories are
 * considered.
 *
 * The page size is fixed for the whole listing: the API computes the offset
 * from `page` and `per_page`, so changing `per_page` between pages would
 * re-read or skip repositories.
 *
 * @param owner - Organisation or user that owns the repositories.
 * @param pattern - Repository name or glob (`*` any run, `?` one character).
 * @param token - Optional API token.
 * @param maxRepos - Upper bound on listed repositories to consider.
 * @returns Matching repositories and whether the listing was cut short.
 */
export async function expandRepoGlob(
	owner: string,
	pattern: string,
	token?: string,
	maxRepos = 500,
): Promise<RepoGlobExpansion> {
	if (!pattern.includes("*")) return { repos: [`${owner}/${pattern}`], truncated: false };
	const regex = new RegExp(`^${globToRegexSource(pattern)}$`);
	const pageSize = Math.max(1, Math.min(PER_PAGE, maxRepos));

	for (const prefix of [`/orgs/${owner}/repos`, `/users/${owner}/repos`]) {
		const repos: Array<{ full_name: string; name: string }> = [];
		let page = 1;
		let truncated = false;
		while (repos.length < maxRepos) {
			const url = `${GITHUB_API_BASE}${prefix}?per_page=${pageSize}&page=${page}&type=all`;
			const response = await githubRequest(url, token);
			if (response.status !== 200) break;
			if (!Array.isArray(response.body)) break;
			const batch = response.body as Array<{ full_name: string; name: string }>;
			repos.push(...batch);
			const pageWasFull = batch.length === pageSize;
			if (repos.length >= maxRepos) {
				// More may exist if the last page was full or rows beyond the limit were read.
				truncated = pageWasFull || repos.length > maxRepos;
				break;
			}
			if (!pageWasFull) break;
			page++;
		}
		if (repos.length > 0) {
			return {
				repos: repos
					.slice(0, maxRepos)
					.filter((r) => regex.test(r.name))
					.map((r) => r.full_name),
				truncated,
			};
		}
	}
	logger.warn("github-source", "Failed to expand repo glob", { owner });
	return { repos: [], truncated: false };
}

/**
 * Translates a repository glob into regular-expression source text.
 *
 * Regex metacharacters are escaped first; `*` then becomes `.*` and an
 * escaped `?` becomes `.`, so both keep their wildcard meaning.
 */
function globToRegexSource(pattern: string): string {
	return pattern
		.replace(/[|\\{}()[\]^$+?.]/g, "\\$&")
		.replace(/\*/g, ".*")
		.replace(/\\\?/g, ".");
}
let scanned = 0; + const maxScanned = Math.min( + Math.max(maxItems * MAX_ISSUE_SCAN_MULTIPLIER, MAX_ISSUE_SCAN_FLOOR), + MAX_ISSUE_SCAN_CEILING, + ); + + while (fetched < maxItems && scanned < maxScanned) { + const params = new URLSearchParams({ + state, + per_page: String(Math.min(PER_PAGE, maxScanned - scanned)), + sort: "updated", + direction: "desc", + page: String(page), + }); + if (since) params.set("since", since); + if (labels && labels.length > 0) { + params.set("labels", labels.join(",")); + } + const url = `${GITHUB_API_BASE}/repos/${config.owner}/${config.repo}/issues?${params}`; + const response = await githubRequest(url, config.token); + const rl = parseRateLimit(response.headers); + rateLimitRemaining = rl.remaining; + rateLimitReset = rl.reset; + if (response.status !== 200) { + errors.push({ message: `Issues fetch failed: ${response.status}`, retryable: response.status >= 500 }); + break; + } + const issues = response.body as GitHubIssue[]; + if (issues.length === 0) break; + for (const issue of issues) { + scanned++; + if (issue.pull_request) continue; + resources.push({ + type: "issue", + number: issue.number, + title: issue.title, + body: issue.body ?? "", + state: issue.state, + labels: issue.labels.map((l) => l.name), + author: issue.user?.login ?? null, + createdAt: issue.created_at, + updatedAt: issue.updated_at, + closedAt: issue.closed_at, + mergedAt: null, + commentsCount: issue.comments, + extra: { + milestone: issue.milestone?.title ?? 
/**
 * Lists pull requests for a repository, most recently updated first.
 *
 * Paging uses a fixed page size, because the API derives the offset from
 * `page` and `per_page`; `maxItems` is enforced while collecting. Collection
 * stops at the first pull request whose `updated_at` is older than `since`.
 *
 * @param config - Repository coordinates and optional token.
 * @param since - Optional ISO timestamp; older pull requests are not returned.
 * @param state - Pull request state filter, "all" by default.
 * @param maxItems - Maximum number of pull requests to return.
 * @returns Collected resources, the last observed rate-limit values, and any
 *   fetch errors (a non-200 response ends paging and is reported here).
 */
export async function fetchPullRequests(
	config: GitHubFetchConfig,
	since?: string,
	state = "all",
	maxItems = 500,
): Promise<GitHubFetchResult> {
	const resources: GitHubResource[] = [];
	const errors: { message: string; retryable: boolean }[] = [];
	let rateLimitRemaining = 5000;
	let rateLimitReset = 0;
	let page = 1;
	let reachedCutoff = false;
	const pageSize = Math.max(1, Math.min(PER_PAGE, maxItems));

	// NOTE(review): the list payload may omit these counters; treat a missing value as 0 rather than NaN.
	const toCount = (value: unknown): number => (typeof value === "number" && Number.isFinite(value) ? value : 0);

	while (resources.length < maxItems && !reachedCutoff) {
		const params = new URLSearchParams({
			state,
			per_page: String(pageSize),
			sort: "updated",
			direction: "desc",
			page: String(page),
		});
		// NOTE(review): the pulls listing may ignore `since`; the cut-off below enforces it either way.
		if (since) params.set("since", since);
		const url = `${GITHUB_API_BASE}/repos/${config.owner}/${config.repo}/pulls?${params}`;
		const response = await githubRequest(url, config.token);
		const rl = parseRateLimit(response.headers);
		rateLimitRemaining = rl.remaining;
		rateLimitReset = rl.reset;
		if (response.status !== 200) {
			errors.push({ message: `PRs fetch failed: ${response.status}`, retryable: response.status >= 500 });
			break;
		}
		const pulls = response.body as GitHubPullRequest[];
		if (pulls.length === 0) break;
		for (const pull of pulls) {
			if (since && pull.updated_at < since) {
				// Results are sorted by update time, so everything after this is older too.
				reachedCutoff = true;
				break;
			}
			if (resources.length >= maxItems) break;
			resources.push({
				type: "pull",
				number: pull.number,
				title: pull.title,
				body: pull.body ?? "",
				state: pull.state,
				labels: pull.labels.map((l) => l.name),
				author: pull.user?.login ?? null,
				createdAt: pull.created_at,
				updatedAt: pull.updated_at,
				closedAt: pull.closed_at,
				mergedAt: pull.merged_at,
				commentsCount: toCount(pull.comments) + toCount(pull.review_comments),
				extra: {
					draft: pull.draft,
					base: pull.base.ref,
					head: pull.head.ref,
					commits: pull.commits,
					changed_files: pull.changed_files,
					milestone: pull.milestone?.title ?? null,
					assignees: pull.assignees.map((a) => a.login),
					html_url: pull.html_url,
				},
			});
		}
		// A short page means the listing is exhausted.
		if (pulls.length < pageSize) break;
		page++;
	}
	return { resources, rateLimitRemaining, rateLimitReset, errors };
}
null, + createdAt: item.created_at, + updatedAt: item.updated_at, + closedAt: item.closed_at ?? null, + mergedAt: null, + commentsCount: item.comments, + extra: { html_url: item.html_url }, + }); + } + if (data.items.length < PER_PAGE) break; + if (data.total_count <= resources.length) break; + page++; + } + return { resources, rateLimitRemaining, rateLimitReset, errors }; +} + +function escapeGitHubSearchValue(value: string): string { + return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\r?\n/g, " ").trim(); +} + +export async function fetchDiscussions( + config: GitHubFetchConfig, + since?: string, + state = "all", + maxItems = 500, +): Promise { + const resources: GitHubResource[] = []; + const errors: { message: string; retryable: boolean }[] = []; + let rateLimitRemaining = 5000; + let rateLimitReset = 0; + + const query = ` + query($owner: String!, $repo: String!, $first: Int!, $after: String) { + repository(owner: $owner, name: $repo) { + discussions(first: $first, after: $after, orderBy: {field: UPDATED_AT, direction: DESC}) { + pageInfo { hasNextPage endCursor } + nodes { + number title body url closed + author { login } + labels(first: 20) { nodes { name } } + createdAt updatedAt + answerId: answer { id } + comments { totalCount } + } + } + } + }`; + let cursor: string | null = null; + let fetched = 0; + + while (fetched < maxItems) { + const variables = { + owner: config.owner, + repo: config.repo, + first: Math.min(100, maxItems - fetched), + after: cursor, + }; + const response = await githubRequest(GRAPHQL_URL, config.token, "POST", { query, variables }); + const rl = parseRateLimit(response.headers); + rateLimitRemaining = rl.remaining; + rateLimitReset = rl.reset; + if (response.status !== 200) { + const body = response.body as { message?: string } | null; + errors.push({ + message: `Discussions fetch failed: ${response.status} ${body?.message ?? 
""}`, + retryable: response.status >= 500, + }); + break; + } + const data = response.body as { + errors?: Array<{ message: string }>; + data?: { + repository?: { + discussions?: { + pageInfo: { hasNextPage: boolean; endCursor: string | null }; + nodes: Array<{ + number: number; + title: string; + body: string; + closed: boolean; + url: string; + author: { login: string } | null; + labels: { nodes: Array<{ name: string }> }; + createdAt: string; + updatedAt: string; + answerId: { id: string } | null; + comments: { totalCount: number }; + }>; + }; + }; + }; + }; + if (data.errors && data.errors.length > 0) { + for (const gqlErr of data.errors) { + errors.push({ message: `GraphQL: ${gqlErr.message}`, retryable: false }); + } + break; + } + const discussions = data.data?.repository?.discussions; + if (!discussions?.nodes?.length) break; + for (const d of discussions.nodes) { + if (since && d.updatedAt < since) { + cursor = null; + break; + } + const discussionState = d.closed ? "closed" : "open"; + if (state !== "all" && discussionState !== state) continue; + resources.push({ + type: "discussion", + number: d.number, + title: d.title, + body: d.body ?? "", + state: discussionState, + labels: d.labels?.nodes?.map((l) => l.name) ?? [], + author: d.author?.login ?? null, + createdAt: d.createdAt, + updatedAt: d.updatedAt, + closedAt: null, + mergedAt: null, + commentsCount: d.comments?.totalCount ?? 0, + extra: { url: d.url, answer_id: d.answerId?.id ?? 
null }, + }); + fetched++; + if (fetched >= maxItems) break; + } + cursor = discussions.pageInfo.endCursor; + if (!discussions.pageInfo.hasNextPage || !cursor || fetched >= maxItems) break; + } + return { resources, rateLimitRemaining, rateLimitReset, errors }; +} + +export async function fetchDiscussionComments( + config: GitHubFetchConfig, + discussionNumber: number, +): Promise { + const query = ` + query($owner: String!, $repo: String!, $number: Int!, $after: String) { + repository(owner: $owner, name: $repo) { + discussion(number: $number) { + comments(first: 100, after: $after) { + pageInfo { hasNextPage endCursor } + nodes { + id body isAnswer + author { login } + createdAt updatedAt + } + } + } + } + }`; + const comments: GitHubDiscussionComment[] = []; + let cursor: string | null = null; + while (true) { + const variables = { owner: config.owner, repo: config.repo, number: discussionNumber, after: cursor }; + const response = await githubRequest(GRAPHQL_URL, config.token, "POST", { query, variables }); + if (response.status !== 200) + throw new Error(`Discussion comments fetch failed for #${discussionNumber}: ${response.status}`); + const data = response.body as { + data?: { + repository?: { + discussion?: { + comments?: { + pageInfo: { hasNextPage: boolean; endCursor: string | null }; + nodes: Array<{ + id: string; + body: string; + isAnswer: boolean; + author: { login: string } | null; + createdAt: string; + updatedAt: string; + }>; + }; + }; + }; + }; + }; + const nodes = data.data?.repository?.discussion?.comments?.nodes ?? 
[]; + for (const c of nodes) { + comments.push({ + id: Number.parseInt(c.id.replace(/^DIC_/, ""), 10) || 0, + body: c.body, + author: c.author, + created_at: c.createdAt, + updated_at: c.updatedAt, + is_answer: c.isAnswer, + }); + } + const pageInfo = data.data?.repository?.discussion?.comments?.pageInfo; + if (!pageInfo?.hasNextPage || comments.length >= MAX_COMMENTS_PER_RESOURCE) break; + cursor = pageInfo.endCursor; + } + return comments.slice(0, MAX_COMMENTS_PER_RESOURCE); +} + +export async function fetchRepoDocs( + config: GitHubFetchConfig, + docPaths: readonly string[], + branch?: string, + maxItems = 500, +): Promise { + const resources: GitHubResource[] = []; + const errors: { message: string; retryable: boolean }[] = []; + let rateLimitRemaining = 5000; + let rateLimitReset = 0; + + for (const docPath of docPaths) { + if (resources.length >= maxItems) break; + if (docPath.includes("*")) { + const treeResources = await fetchTreeDocs(config, docPath, branch, maxItems - resources.length); + resources.push(...treeResources.resources); + errors.push(...treeResources.errors); + continue; + } + const ref = branch ? `?ref=${branch}` : ""; + const url = `${GITHUB_API_BASE}/repos/${config.owner}/${config.repo}/contents/${docPath}${ref}`; + const response = await githubRequest(url, config.token); + const rl = parseRateLimit(response.headers); + rateLimitRemaining = rl.remaining; + rateLimitReset = rl.reset; + if (response.status === 404) continue; + if (response.status !== 200) { + errors.push({ + message: `Doc fetch failed for ${docPath}: ${response.status}`, + retryable: response.status >= 500, + }); + continue; + } + const data = response.body as { content?: string; sha?: string; name?: string }; + if (!data.content) continue; + const content = Buffer.from(data.content, "base64").toString("utf-8"); + resources.push({ + type: "doc", + path: docPath, + title: (data.name ?? 
docPath).replace(/\.[^.]+$/, ""), + body: content, + state: "open", + labels: [], + author: null, + createdAt: "", + updatedAt: "", + closedAt: null, + mergedAt: null, + commentsCount: 0, + extra: { sha: data.sha, path: docPath }, + }); + } + return { resources, rateLimitRemaining, rateLimitReset, errors }; +} + +async function fetchTreeDocs( + config: GitHubFetchConfig, + globPath: string, + branch?: string, + maxItems = 100, +): Promise { + const resources: GitHubResource[] = []; + const errors: { message: string; retryable: boolean }[] = []; + const dir = globPath.replace(/\/\*\*\/.*$/, "").replace(/\/\*.*$/, ""); + const matcher = globPath.includes("**/*.md") + ? (p: string) => p.endsWith(".md") + : globPath.includes("*.md") + ? (p: string) => p.endsWith(".md") && !p.includes("/") + : () => false; + + const sha = branch || "HEAD"; + const treeUrl = `${GITHUB_API_BASE}/repos/${config.owner}/${config.repo}/git/trees/${sha}${dir ? `:${dir}` : ""}?recursive=1`; + const treeResponse = await githubRequest(treeUrl, config.token); + if (treeResponse.status !== 200) { + errors.push({ + message: `Tree fetch failed for ${dir}: ${treeResponse.status}`, + retryable: treeResponse.status >= 500, + }); + return { resources, rateLimitRemaining: 5000, rateLimitReset: 0, errors }; + } + const treeData = treeResponse.body as { tree?: Array<{ path: string; type: string }> }; + const entries = (treeData.tree ?? []).filter((e) => e.type === "blob" && matcher(e.path)); + const refParam = branch ? `?ref=${branch}` : ""; + + for (const entry of entries.slice(0, maxItems)) { + const fileUrl = `${GITHUB_API_BASE}/repos/${config.owner}/${config.repo}/contents/${dir ? 
`${dir}/` : ""}${entry.path}${refParam}`; + const fileResponse = await githubRequest(fileUrl, config.token); + if (fileResponse.status !== 200) { + errors.push({ message: `File fetch failed: ${entry.path}`, retryable: true }); + continue; + } + const data = fileResponse.body as { content?: string; sha?: string; name?: string }; + if (!data.content) continue; + const content = Buffer.from(data.content, "base64").toString("utf-8"); + resources.push({ + type: "doc", + path: dir ? `${dir}/${entry.path}` : entry.path, + title: (data.name ?? entry.path).replace(/\.[^.]+$/, ""), + body: content, + state: "open", + labels: [], + author: null, + createdAt: "", + updatedAt: "", + closedAt: null, + mergedAt: null, + commentsCount: 0, + extra: { sha: data.sha, path: entry.path }, + }); + } + return { resources, rateLimitRemaining: 5000, rateLimitReset: 0, errors }; +} + +export function resourceToMarkdown( + resource: GitHubResource, + comments?: readonly { author: string | null; body: string; createdAt: string }[], +): string { + const parts: string[] = []; + parts.push(`# ${resource.title}`); + parts.push(""); + const meta: string[] = [ + `**Type:** ${resource.type}`, + resource.number != null ? `**Number:** #${resource.number}` : null, + `**State:** ${resource.state}`, + resource.labels.length > 0 ? `**Labels:** ${resource.labels.join(", ")}` : null, + resource.author ? `**Author:** @${resource.author}` : null, + resource.createdAt ? `**Created:** ${resource.createdAt}` : null, + resource.updatedAt ? `**Updated:** ${resource.updatedAt}` : null, + resource.closedAt ? `**Closed:** ${resource.closedAt}` : null, + resource.mergedAt ? `**Merged:** ${resource.mergedAt}` : null, + resource.extra.draft != null ? `**Draft:** ${resource.extra.draft ? "yes" : "no"}` : null, + resource.extra.base && resource.extra.head ? 
`**Branch:** ${resource.extra.head} → ${resource.extra.base}` : null, + ].filter(Boolean); + parts.push(meta.join(" | ")); + parts.push(""); + if (resource.body.trim()) { + parts.push(resource.body.trim()); + parts.push(""); + } + if (comments && comments.length > 0) { + parts.push("## Comments"); + parts.push(""); + for (const comment of comments) { + const author = comment.author ? `**@${comment.author}**` : "*unknown*"; + parts.push(`${author} (${comment.createdAt}):`); + parts.push(comment.body.trim()); + parts.push(""); + } + } + return parts.join("\n"); +} diff --git a/platform/daemon/src/github-source-graph.test.ts b/platform/daemon/src/github-source-graph.test.ts new file mode 100644 index 000000000..2dca0b4ed --- /dev/null +++ b/platform/daemon/src/github-source-graph.test.ts @@ -0,0 +1,35 @@ +import { describe, expect, test } from "bun:test"; +import { extractGitHubRefs } from "./github-source-graph"; + +describe("extractGitHubRefs", () => { + test("extracts #123 references", () => { + const refs = extractGitHubRefs("Fixes #42 and closes #100", "owner/repo"); + expect(refs.length).toBeGreaterThanOrEqual(2); + const numbers = refs.map((r) => r.number); + expect(numbers).toContain(42); + expect(numbers).toContain(100); + }); + + test("extracts GitHub URLs", () => { + const refs = extractGitHubRefs( + "See https://github.com/owner/repo/pull/55 and https://github.com/owner/repo/issues/77", + "owner/repo", + ); + expect(refs.length).toBeGreaterThanOrEqual(2); + const types = refs.map((r) => r.type); + expect(types).toContain("pull"); + expect(types).toContain("issue"); + }); + + test("deduplicates references", () => { + const refs = extractGitHubRefs("Fixes #42. 
See also #42.", "owner/repo"); + const numbers = refs.map((r) => r.number); + const unique = new Set(numbers); + expect(unique.size).toBe(numbers.length); + }); + + test("handles empty body", () => { + const refs = extractGitHubRefs("", "owner/repo"); + expect(refs.length).toBe(0); + }); +}); diff --git a/platform/daemon/src/github-source-graph.ts b/platform/daemon/src/github-source-graph.ts new file mode 100644 index 000000000..c4ffda0b7 --- /dev/null +++ b/platform/daemon/src/github-source-graph.ts @@ -0,0 +1,419 @@ +import { createHash } from "node:crypto"; +import type { WriteDb } from "./db-accessor"; +import { getDbAccessor } from "./db-accessor"; +import { requireDependencyReason } from "./dependency-history"; +import type { GitHubResource } from "./github-source-fetch"; + +const GITHUB_SOURCE_KIND = "source_github_resource"; + +export interface IndexGitHubSourceStructureInput { + readonly agentId: string; + readonly sourceId: string; + readonly sourceName: string; + readonly repo: string; + readonly resource: GitHubResource; +} + +export interface PurgeGitHubSourceStructureInput { + readonly agentId?: string; + readonly sourceId: string; +} + +function idFor(...parts: readonly string[]): string { + return `ghsrc_${createHash("sha256").update(parts.join("\0")).digest("hex").slice(0, 28)}`; +} + +function upsertEntity( + db: WriteDb, + input: { + readonly id: string; + readonly name: string; + readonly canonicalName: string; + readonly entityType: string; + readonly agentId: string; + readonly sourceId: string; + readonly sourcePath: string; + readonly now: string; + }, +): string { + const uniqueName = `${input.name} — ${input.canonicalName} — ${input.agentId}`; + const existing = db + .prepare("SELECT id FROM entities WHERE canonical_name = ? AND agent_id = ? 
LIMIT 1") + .get(input.canonicalName, input.agentId) as { id: string } | undefined; + if (existing) { + db.prepare( + `UPDATE entities + SET name = ?, entity_type = ?, mentions = MAX(COALESCE(mentions, 0), 1), updated_at = ?, + source_id = ?, source_kind = ?, source_path = ?, source_root = ? + WHERE id = ?`, + ).run( + uniqueName, + input.entityType, + input.now, + input.sourceId, + GITHUB_SOURCE_KIND, + input.sourcePath, + input.sourceId, + existing.id, + ); + return existing.id; + } + db.prepare( + `INSERT INTO entities + (id, name, canonical_name, entity_type, agent_id, mentions, created_at, updated_at, + source_id, source_kind, source_path, source_root) + VALUES (?, ?, ?, ?, ?, 1, ?, ?, ?, ?, ?, ?)`, + ).run( + input.id, + uniqueName, + input.canonicalName, + input.entityType, + input.agentId, + input.now, + input.now, + input.sourceId, + GITHUB_SOURCE_KIND, + input.sourcePath, + input.sourceId, + ); + return input.id; +} + +function upsertCommunity( + db: WriteDb, + input: { + readonly id: string; + readonly name: string; + readonly agentId: string; + readonly sourceId: string; + readonly now: string; + }, +): void { + db.prepare( + `INSERT INTO entity_communities + (id, agent_id, name, cohesion, member_count, created_at, updated_at, source_id, source_kind, source_path, source_root) + VALUES (?, ?, ?, 1.0, 0, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT(id) DO UPDATE SET + name = excluded.name, + updated_at = excluded.updated_at, + source_id = excluded.source_id`, + ).run( + input.id, + input.agentId, + input.name, + input.now, + input.now, + input.sourceId, + GITHUB_SOURCE_KIND, + "", + input.sourceId, + ); +} + +function upsertDependency( + db: WriteDb, + input: { + readonly sourceEntityId: string; + readonly targetEntityId: string; + readonly agentId: string; + readonly type: string; + readonly strength: number; + readonly confidence: number; + readonly reason: string; + readonly sourceId: string; + readonly now: string; + }, +): boolean { + const existing = db + .prepare( + `SELECT id FROM entity_dependencies + WHERE source_entity_id = ? AND target_entity_id = ? AND dependency_type = ? AND agent_id = ? + LIMIT 1`, + ) + .get(input.sourceEntityId, input.targetEntityId, input.type, input.agentId) as { id: string } | undefined; + if (existing) { + db.prepare( + `UPDATE entity_dependencies + SET strength = MAX(strength, ?), confidence = MAX(COALESCE(confidence, 0), ?), + reason = ?, updated_at = ?, source_id = ?, source_kind = ?, source_path = ?, source_root = ? 
+ WHERE id = ?`, + ).run( + input.strength, + input.confidence, + input.reason, + input.now, + input.sourceId, + GITHUB_SOURCE_KIND, + "", + input.sourceId, + existing.id, + ); + return false; + } + const id = idFor("dep", input.agentId, input.type, input.sourceEntityId, input.targetEntityId); + db.prepare( + `INSERT INTO entity_dependencies + (id, source_entity_id, target_entity_id, agent_id, dependency_type, strength, confidence, reason, + created_at, updated_at, source_id, source_kind, source_path, source_root) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ).run( + id, + input.sourceEntityId, + input.targetEntityId, + input.agentId, + input.type, + input.strength, + input.confidence, + input.reason, + input.now, + input.now, + input.sourceId, + GITHUB_SOURCE_KIND, + "", + input.sourceId, + ); + return true; +} + +export function indexGitHubSourceStructure(input: IndexGitHubSourceStructureInput): void { + const now = new Date().toISOString(); + const sourcePath = resourceSourcePath(input.repo, input.resource); + + getDbAccessor().withWriteTx((db) => { + const sourceEntityId = idFor(input.agentId, input.sourceId, "source"); + upsertEntity(db, { + id: sourceEntityId, + name: input.sourceName, + canonicalName: `github:${input.sourceId}`, + entityType: "source", + agentId: input.agentId, + sourceId: input.sourceId, + sourcePath: input.sourceId, + now, + }); + + const repoEntityId = idFor(input.agentId, input.sourceId, "repo", input.repo); + const repoCanonical = `github:${input.sourceId}:${input.repo}`; + upsertEntity(db, { + id: repoEntityId, + name: input.repo, + canonicalName: repoCanonical, + entityType: "source_folder", + agentId: input.agentId, + sourceId: input.sourceId, + sourcePath: `github:${input.repo}`, + now, + }); + const repoCommunityId = idFor(input.agentId, input.sourceId, "community", input.repo); + upsertCommunity(db, { + id: repoCommunityId, + name: input.repo, + agentId: input.agentId, + sourceId: input.sourceId, + now, + }); + 
db.prepare("UPDATE entities SET community_id = ? WHERE id = ?").run(repoCommunityId, repoEntityId); + upsertDependency(db, { + sourceEntityId, + targetEntityId: repoEntityId, + agentId: input.agentId, + type: "contains", + strength: 1, + confidence: 1, + reason: requireDependencyReason("related_to", `GitHub source contains repo ${input.repo}`), + sourceId: input.sourceId, + now, + }); + + const resourceEntityId = idFor( + input.agentId, + input.sourceId, + "resource", + input.repo, + input.resource.type, + String(input.resource.number ?? input.resource.path), + ); + const resourceCanonical = `github:${input.sourceId}:${sourcePath}`; + upsertEntity(db, { + id: resourceEntityId, + name: resourceDisplayName(input.resource), + canonicalName: resourceCanonical, + entityType: "source_document", + agentId: input.agentId, + sourceId: input.sourceId, + sourcePath: sourcePath, + now, + }); + db.prepare("UPDATE entities SET community_id = ? WHERE id = ?").run(repoCommunityId, resourceEntityId); + upsertDependency(db, { + sourceEntityId: repoEntityId, + targetEntityId: resourceEntityId, + agentId: input.agentId, + type: "contains", + strength: 1, + confidence: 1, + reason: requireDependencyReason( + "related_to", + `GitHub repo ${input.repo} contains ${input.resource.type} ${input.resource.number ?? input.resource.path}`, + ), + sourceId: input.sourceId, + now, + }); + + db.prepare( + "DELETE FROM entity_dependencies WHERE source_entity_id = ? AND agent_id = ? AND source_id = ? 
AND dependency_type IN ('tagged_with', 'wiki_link')", + ).run(resourceEntityId, input.agentId, input.sourceId); + + for (const label of input.resource.labels) { + const labelEntityId = idFor(input.agentId, input.sourceId, "label", label); + upsertEntity(db, { + id: labelEntityId, + name: label, + canonicalName: `github:${input.sourceId}:${input.repo}:label:${label}`, + entityType: "source_document_reference", + agentId: input.agentId, + sourceId: input.sourceId, + sourcePath: `github:${input.repo}:label:${label}`, + now, + }); + upsertDependency(db, { + sourceEntityId: resourceEntityId, + targetEntityId: labelEntityId, + agentId: input.agentId, + type: "tagged_with", + strength: 0.8, + confidence: 1, + reason: requireDependencyReason("related_to", `GitHub ${input.resource.type} labeled ${label}`), + sourceId: input.sourceId, + now, + }); + } + + const body = input.resource.body ?? ""; + const refs = extractGitHubRefs(body, input.repo); + for (const ref of refs) { + const refEntityId = idFor(input.agentId, input.sourceId, "resource", input.repo, ref.type, String(ref.number)); + upsertEntity(db, { + id: refEntityId, + name: `${ref.type} #${ref.number}`, + canonicalName: `github:${input.sourceId}:github:${input.repo}:${ref.type}:${ref.number}`, + entityType: "source_document_reference", + agentId: input.agentId, + sourceId: input.sourceId, + sourcePath: `github:${input.repo}:${ref.type}:${ref.number}`, + now, + }); + upsertDependency(db, { + sourceEntityId: resourceEntityId, + targetEntityId: refEntityId, + agentId: input.agentId, + type: "wiki_link", + strength: ref.type === "pull" ? 0.9 : 0.7, + confidence: 0.8, + reason: requireDependencyReason( + "related_to", + `GitHub ${input.resource.type} references ${ref.type} #${ref.number}`, + ), + sourceId: input.sourceId, + now, + }); + } + + db.prepare( + `UPDATE entity_communities + SET member_count = ( + SELECT COUNT(*) FROM entities e WHERE e.community_id = entity_communities.id + ), updated_at = ? 
+ WHERE agent_id = ? AND source_id = ?`, + ).run(now, input.agentId, input.sourceId); + }); +} + +function resourceSourcePath(repo: string, resource: GitHubResource): string { + if (resource.type === "doc" && resource.path) return `github:${repo}:docs:${resource.path}`; + return `github:${repo}:${resource.type}:${resource.number}`; +} + +function resourceDisplayName(resource: GitHubResource): string { + if (resource.type === "doc" && resource.path) return resource.path.split("/").pop() ?? resource.path; + return `${resource.type} #${resource.number}: ${resource.title}`; +} + +interface GitHubRef { + readonly type: string; + readonly number: number; +} + +export function extractGitHubRefs(body: string, _repo: string): GitHubRef[] { + const refs = new Map(); + const patterns = [ + /(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?|ref[s]?|see)\s+#(\d+)/gi, + /(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?|ref[s]?|see)\s+https:\/\/github\.com\/[^/]+\/[^/]+\/(issues|pull)\/(\d+)/gi, + /https:\/\/github\.com\/[^/]+\/[^/]+\/(issues|pull)\/(\d+)/gi, + /#(\d+)/g, + ]; + for (const pattern of patterns) { + let match: RegExpExecArray | null; + while ((match = pattern.exec(body))) { + if (pattern === patterns[3]) { + const num = Number(match[1]); + if (num > 0 && num < 1_000_000) { + const key = `issue:${num}`; + if (!refs.has(key)) refs.set(key, { type: "issue", number: num }); + } + } else if (pattern === patterns[1] || pattern === patterns[2]) { + const type = match[1] === "pull" ? "pull" : "issue"; + const num = Number(match[2]); + if (num > 0) refs.set(`${type}:${num}`, { type, number: num }); + } else { + const num = Number(match[1]); + if (num > 0 && num < 1_000_000) { + refs.set(`issue:${num}`, { type: "issue", number: num }); + } + } + } + } + return [...refs.values()]; +} + +export function purgeGitHubSourceStructure(input: PurgeGitHubSourceStructureInput): number { + const agentWhere = input.agentId ? "agent_id = ? AND " : ""; + const params = input.agentId ? 
[input.agentId, input.sourceId] : [input.sourceId]; + return getDbAccessor().withWriteTx((db) => { + const attrs = db.prepare(`DELETE FROM entity_attributes WHERE ${agentWhere}source_id = ?`).run(...params).changes; + const deps = db.prepare(`DELETE FROM entity_dependencies WHERE ${agentWhere}source_id = ?`).run(...params).changes; + const entities = db.prepare(`DELETE FROM entities WHERE ${agentWhere}source_id = ?`).run(...params).changes; + const communities = db + .prepare(`DELETE FROM entity_communities WHERE ${agentWhere}source_id = ?`) + .run(...params).changes; + return entities + attrs + deps + communities; + }); +} + +export interface PurgeGitHubResourceStructureInput { + readonly sourceId: string; + readonly repo: string; + readonly agentId: string; + readonly resource: GitHubResource; +} + +export function purgeGitHubResourceStructure(input: PurgeGitHubResourceStructureInput): number { + const sourcePath = resourceSourcePath(input.repo, input.resource); + const canonicalName = `github:${input.sourceId}:${sourcePath}`; + return getDbAccessor().withWriteTx((db) => { + const entity = db + .prepare("SELECT id FROM entities WHERE canonical_name = ? AND agent_id = ? LIMIT 1") + .get(canonicalName, input.agentId) as { id: string } | undefined; + if (!entity) return 0; + const attrs = db + .prepare("DELETE FROM entity_attributes WHERE entity_id = ? AND agent_id = ?") + .run(entity.id, input.agentId).changes; + const deps = db + .prepare("DELETE FROM entity_dependencies WHERE (source_entity_id = ? OR target_entity_id = ?) AND agent_id = ?") + .run(entity.id, entity.id, input.agentId).changes; + const entities = db + .prepare("DELETE FROM entities WHERE id = ? 
AND agent_id = ?") + .run(entity.id, input.agentId).changes; + return entities + attrs + deps; + }); +} diff --git a/platform/daemon/src/routes/sources-routes.ts b/platform/daemon/src/routes/sources-routes.ts index 7d2fb5223..96bbeba4e 100644 --- a/platform/daemon/src/routes/sources-routes.ts +++ b/platform/daemon/src/routes/sources-routes.ts @@ -6,6 +6,7 @@ import { dirname } from "node:path"; import { promisify } from "node:util"; import { type SignetSourceEntry, + addGitHubSource, addObsidianSource, loadSourcesConfig, markSourceIndexed, @@ -13,7 +14,11 @@ import { } from "@signet/core"; import type { Hono } from "hono"; import { resolveDaemonAgentId } from "../agent-id"; +import { requirePermission } from "../auth/middleware"; import { getDbAccessor } from "../db-accessor"; +import { fetchEmbedding } from "../embedding-fetch"; +import { type GitHubSourceBridgeOptions, purgeGitHubSource, syncGitHubSource } from "../github-source-bridge"; +import { loadMemoryConfig } from "../memory-config"; import { type NativeMemoryBridgeHandle, obsidianNativeMemorySource, @@ -35,6 +40,7 @@ import { markSourceIndexJobRunning, updateSourceIndexJobProgress, } from "../source-index-progress"; +import { authConfig } from "./state.js"; interface SourceIndexJobInput { readonly source: SignetSourceEntry; @@ -59,6 +65,18 @@ interface AddObsidianSourceBody { readonly excludeGlobs?: readonly string[]; } +interface AddGitHubSourceBody { + readonly repos?: readonly string[]; + readonly name?: string; + readonly tokenRef?: string; + readonly resourceTypes?: readonly ("issues" | "pulls" | "discussions" | "docs")[]; + readonly state?: "open" | "closed" | "all"; + readonly includeComments?: boolean; + readonly labels?: readonly string[]; + readonly docPaths?: readonly string[]; + readonly maxItemsPerRepo?: number; +} + interface PickDirectoryBody { readonly title?: string; } @@ -125,21 +143,73 @@ export function registerSourcesRoutes(app: Hono, deps: RegisterSourcesRoutesDeps return c.json({ 
source: result.source, created: result.created, indexed: 0, queued: true, job }, 202); }); - app.delete("/api/sources/:sourceId", (c) => { + app.post("/api/sources/github", requirePermission("admin", authConfig), async (c) => { + let body: AddGitHubSourceBody = {}; + try { + body = (await c.req.json()) as AddGitHubSourceBody; + } catch { + return c.json({ error: "Invalid JSON body" }, 400); + } + + const repos = Array.isArray(body.repos) ? body.repos.filter((r) => typeof r === "string") : []; + if (repos.length === 0) return c.json({ error: "repos is required (e.g. ['owner/repo', 'owner/*'])" }, 400); + + const result = addGitHubSource( + { + repos, + name: body.name, + tokenRef: body.tokenRef, + resourceTypes: body.resourceTypes, + state: body.state, + includeComments: body.includeComments, + labels: body.labels, + docPaths: body.docPaths, + maxItemsPerRepo: body.maxItemsPerRepo, + agentId: resolveDaemonAgentId(), + }, + agentsDir, + ); + if (result.ok === false) return c.json({ error: result.error }, 400); + + const embeddingCfg = loadMemoryConfig(agentsDir); + const ec = embeddingCfg.embedding.provider !== "none" ? embeddingCfg.embedding : undefined; + const job = enqueueGitHubSourceIndexJob(result.source, { + agentsDir, + embeddingConfig: ec, + fetchEmbedding: ec ? 
fetchEmbedding : undefined, + }); + + return c.json({ source: result.source, created: result.created, queued: true, job }, 202); + }); + + app.delete("/api/sources/:sourceId", requirePermission("admin", authConfig), async (c) => { const sourceId = c.req.param("sourceId"); + const currentAgentId = resolveDaemonAgentId(); + const source = loadSourcesConfig(agentsDir).sources.find((entry) => entry.id === sourceId); + if (!source) return c.json({ error: `Source not found: ${sourceId}` }, 404); + if (source.agentId && source.agentId !== currentAgentId) { + return c.json({ error: "Source is owned by a different agent" }, 403); + } + if (source.kind === "github" && isSourceIndexInFlight(source.id)) { + cancelSourceIndexJob(source.id); + return c.json({ error: "Source indexing is in flight; cancellation requested, retry deletion shortly" }, 409); + } + const result = removeSource(sourceId, agentsDir); if (result.ok === false) return c.json({ error: result.error }, 404); cancelSourceIndexJob(result.source.id); - const sourceAgentId = resolveDaemonAgentId(); + const sourceAgentId = currentAgentId; recordSourceDeletionTombstone(result.source, sourceAgentId, agentsDir); - const purged = - result.source.kind === "obsidian" - ? 
purgeNativeSource( - obsidianNativeMemorySource(result.source.root, result.source.name, result.source.id), - sourceAgentId, - ) - : 0; + let purged = 0; + if (result.source.kind === "obsidian") { + purged = purgeNativeSource( + obsidianNativeMemorySource(result.source.root, result.source.name, result.source.id), + sourceAgentId, + ); + } else if (result.source.kind === "github") { + purged = await purgeGitHubSource(result.source.id, sourceAgentId); + } if (!isSourceIndexInFlight(result.source.id)) clearSourceDeletionTombstone(result.source.id, sourceAgentId, agentsDir); return c.json({ source: result.source, purged }); @@ -212,6 +282,46 @@ function scheduleSourceIndexJob(input: SourceIndexJobInput, job: SourceIndexJob, }, delayMs).unref?.(); } +function enqueueGitHubSourceIndexJob(source: SignetSourceEntry, options: GitHubSourceBridgeOptions): SourceIndexJob { + const job = beginSourceIndexJob(source.id, "github-source-index"); + setTimeout(() => { + if (!isCurrentSourceIndexJob(source.id, job.id)) return; + if (isSourceIndexInFlight(source.id)) { + setTimeout(() => void runGitHubSourceIndexJob(source, options, job), 50).unref?.(); + return; + } + markSourceIndexInFlight(source.id); + if (!markSourceIndexJobRunning(source.id, job.id)) { + clearSourceIndexInFlight(source.id); + return; + } + void runGitHubSourceIndexJob(source, options, job); + }, 0).unref?.(); + return job; +} + +async function runGitHubSourceIndexJob( + source: SignetSourceEntry, + options: GitHubSourceBridgeOptions, + job: SourceIndexJob, +): Promise { + try { + const result = await syncGitHubSource(source, options); + if (!isCurrentSourceIndexJob(source.id, job.id)) return; + if (result.hadErrors) { + failSourceIndexJob(source.id, job.id, "GitHub source sync completed with partial errors"); + } else { + markSourceIndexed(source.id, undefined, options.agentsDir); + completeSourceIndexJob(source.id, job.id, result.indexed); + } + } catch (err) { + if (!isCurrentSourceIndexJob(source.id, job.id)) 
return; + failSourceIndexJob(source.id, job.id, err); + } finally { + clearSourceIndexInFlight(source.id); + } +} + function cleanupSourceDeletionTombstones( agentsDir: string, purgeNativeSource: typeof purgeNativeMemorySourceArtifacts, @@ -232,6 +342,8 @@ function cleanupSourceDeletionTombstones( ), tombstone.agentId, ); + } else if (tombstone.source.kind === "github") { + purgeGitHubSource(tombstone.source.id, tombstone.agentId); } } saveSourceDeletionTombstones(remaining, agentsDir); @@ -294,7 +406,7 @@ function isSourceDeletionTombstone(value: unknown): value is SourceDeletionTombs !!candidate.source && typeof candidate.source === "object" && typeof candidate.source.id === "string" && - candidate.source.kind === "obsidian" + (candidate.source.kind === "obsidian" || candidate.source.kind === "github") ); } diff --git a/surfaces/cli/src/commands/sources.ts b/surfaces/cli/src/commands/sources.ts index c4c38a533..114c6ecb2 100644 --- a/surfaces/cli/src/commands/sources.ts +++ b/surfaces/cli/src/commands/sources.ts @@ -1,5 +1,11 @@ import type { Command } from "commander"; -import { type SourcesDeps, addObsidianVaultSource, listSources, removeConfiguredSource } from "../features/sources.js"; +import { + type SourcesDeps, + addGitHubRepoSource, + addObsidianVaultSource, + listSources, + removeConfiguredSource, +} from "../features/sources.js"; import type { DaemonApiCall } from "../lib/daemon.js"; export interface RegisterSourcesCommandsDeps extends SourcesDeps { @@ -62,4 +68,17 @@ export function registerSourcesCommands(program: Command, deps: RegisterSourcesC .action((path: string, options: { name?: string; exclude?: string[] }) => addObsidianVaultSource(path, options, deps), ); + + add + .command("github") + .description("Index GitHub repos (issues, PRs, discussions, docs) as a recall source") + .requiredOption("--repos ", "Repo patterns (owner/repo or owner/*)") + .option("--name ", "Display name for this source") + .option("--token-ref ", "Signet secret 
reference for GitHub PAT") + .option("--types ", "Resource types: issues,pulls,discussions,docs", "issues,pulls,discussions,docs") + .option("--state ", "Filter by state: open, closed, all", "all") + .option("--no-comments", "Skip fetching comments") + .option("--doc-paths ", "Doc file paths to index", ["README.md", "CHANGELOG.md"]) + .option("--max-items ", "Max items per repo", "500") + .action((options) => addGitHubRepoSource(options, deps)); } diff --git a/surfaces/cli/src/features/sources.ts b/surfaces/cli/src/features/sources.ts index 0b27b39dd..71318c3dc 100644 --- a/surfaces/cli/src/features/sources.ts +++ b/surfaces/cli/src/features/sources.ts @@ -1,4 +1,4 @@ -import { addObsidianSource, loadSourcesConfig, removeSource } from "@signet/core"; +import { addGitHubSource, addObsidianSource, loadSourcesConfig, removeSource } from "@signet/core"; import chalk from "chalk"; export interface SourcesDeps { @@ -40,6 +40,85 @@ export async function addObsidianVaultSource( console.log(chalk.dim("Run `signet daemon restart` if the daemon is already running.")); } +export async function addGitHubRepoSource( + options: { + readonly repos?: readonly string[]; + readonly name?: string; + readonly tokenRef?: string; + readonly types?: string; + readonly state?: string; + readonly comments?: boolean; + readonly docPaths?: readonly string[]; + readonly maxItems?: string; + }, + deps: SourcesDeps, +): Promise { + const repos = options.repos ?? []; + if (repos.length === 0) { + console.error(chalk.red("✗ --repos is required (e.g. --repos owner/repo owner/*)")); + process.exitCode = 1; + return; + } + const validTypes = new Set(["issues", "pulls", "discussions", "docs"]); + const rawTypes = options.types + ? 
options.types + .split(",") + .map((t) => t.trim()) + .filter(Boolean) + : undefined; + if (options.types && (!rawTypes || rawTypes.length === 0)) { + console.error(chalk.red("✗ --types must include at least one resource type")); + process.exitCode = 1; + return; + } + if (rawTypes && rawTypes.length > 0) { + const invalid = rawTypes.filter((t) => !validTypes.has(t)); + if (invalid.length > 0) { + console.error( + chalk.red(`✗ Invalid resource types: ${invalid.join(", ")}. Must be one of: issues, pulls, discussions, docs`), + ); + process.exitCode = 1; + return; + } + } + const resourceTypes = rawTypes as ("issues" | "pulls" | "discussions" | "docs")[] | undefined; + if (options.state && !["open", "closed", "all"].includes(options.state)) { + console.error(chalk.red(`✗ Invalid state: ${options.state}. Must be one of: open, closed, all`)); + process.exitCode = 1; + return; + } + const maxItems = options.maxItems ? Number(options.maxItems) : undefined; + + const result = addGitHubSource( + { + repos, + name: options.name, + tokenRef: options.tokenRef, + resourceTypes, + state: options.state as "open" | "closed" | "all" | undefined, + includeComments: options.comments, + docPaths: options.docPaths, + maxItemsPerRepo: maxItems, + agentId: process.env.SIGNET_AGENT_ID?.trim() || undefined, + }, + deps.agentsDir, + ); + if (result.ok === false) { + console.error(chalk.red(`✗ ${result.error}`)); + process.exitCode = 1; + return; + } + + const verb = result.created ? "Added" : "Updated"; + console.log(chalk.green(`✓ ${verb} GitHub source: ${result.source.name}`)); + console.log(chalk.dim(` repos: ${repos.join(", ")}`)); + if (options.tokenRef) console.log(chalk.dim(` token: ${options.tokenRef}`)); + console.log(chalk.dim(` types: ${(resourceTypes ?? 
["issues", "pulls", "discussions", "docs"]).join(", ")}`)); + console.log(); + console.log(chalk.dim("The daemon indexes GitHub sources on startup and polls every 5 minutes.")); + console.log(chalk.dim("Run `signet daemon restart` if the daemon is already running.")); +} + export async function listSources(deps: SourcesDeps): Promise { const config = loadSourcesConfig(deps.agentsDir); if (config.sources.length === 0) {