diff --git a/cli/README.md b/cli/README.md
index e4a2e199..06eabe8c 100644
--- a/cli/README.md
+++ b/cli/README.md
@@ -247,6 +247,7 @@ notifications:
   discord_webhook: "https://discord.com/api/webhooks/..."
   slack_webhook: "https://hooks.slack.com/services/..."
   custom_webhook: "https://your-api.com/webhook"
+  telemetry_webhook: "https://your-api.com/telemetry" # optional telemetry export hook
 ```
 
 Notifications include task completion counts and status (completed/failed).
diff --git a/cli/src/config/constants.ts b/cli/src/config/constants.ts
new file mode 100644
index 00000000..04789fde
--- /dev/null
+++ b/cli/src/config/constants.ts
@@ -0,0 +1,80 @@
+export const PROGRESS_UPDATE_INTERVAL = 500;
+export const HEARTBEAT_INTERVAL = 5000;
+// Default retry count used by CLI/runtime options.
+export const DEFAULT_MAX_RETRIES = 3;
+// Legacy alias retained for older modules.
+export const MAX_RETRIES = DEFAULT_MAX_RETRIES;
+export const UI_LABELS = {
+  PLANNING: "[PLANNING]",
+  EXECUTION: "[EXECUTION]",
+  DONE: "[DONE]",
+  FAIL: "[FAIL]",
+  OK: "[OK]",
+};
+export const SPINNER_CHARS = ["|", "/", "-", "\\"];
+export const MAX_EXECUTION_TIME = 300000; // 5 minutes
+export const PROGRESS_POLL_INTERVAL = 2000;
+export const WATCHER_DEBOUNCE = 250;
+export const PLANNING_COOLDOWN = 2000;
+
+// CLI Defaults
+export const DEFAULT_RETRY_DELAY = 5;
+export const DEFAULT_MAX_PARALLEL = 3;
+export const DEFAULT_MAX_REPLANS = 3;
+export const DEFAULT_MAX_ITERATIONS = 0;
+
+// AI Engine Defaults
+export const DEFAULT_AI_ENGINE_TIMEOUT_MS = 80 * 60 * 1000; // 80 minutes
+export const STREAM_HEARTBEAT_INTERVAL_MS = 30000; // 30 seconds without output = potential hang
+
+// Parallel Execution
+export const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours
+export const INITIAL_POOL_SIZE_MULTIPLIER = 1;
+export const MAX_POOL_SIZE_MULTIPLIER = 5;
+export const PLANNING_CONCURRENCY = 5;
+export const POOL_INCREMENT = 2;
+
+// Progress Monitoring
+export const MAX_OPERATIONS_HISTORY = 10;
+export const RECENT_ACTIONS_COUNT = 3;
+export const FIND_WORKTREE_RETRIES = 20;
+export const MAX_DISPLAYED_ACTIONS = 3;
+
+// Sandbox Management
+export const DEFAULT_MAX_SANDBOX_AGE_MS = 60 * 60 * 1000; // 1 hour
+export const SANDBOX_STALE_THRESHOLD_MS = 24 * 60 * 60 * 1000; // 24 hours
+export const SANDBOX_BACKGROUND_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
+export const MS_PER_MINUTE = 60000;
+export const CLEANUP_DELAY_MS = 5000;
+export const COPY_BACK_CONCURRENCY = 10;
+export const SANDBOX_DIR_PREFIX = "agent-";
+export const SANDBOX_SUFFIX = "";
+export const DEFAULT_IGNORE_PATTERNS = [
+  ".git",
+  "node_modules",
+  ".ralphy-sandboxes",
+  ".ralphy-worktrees",
+  ".ralphy",
+  "agent-*",
+  "sandbox-*",
+];
+
+// File Utilities
+export const MAX_FILE_SIZE_FOR_HASH = 2 * 1024 * 1024; // 2MB
+export const DEFAULT_RECURSION_DEPTH = 5;
+
+// Lock Management
+export const LOCK_TIMEOUT_MS = 30000; // 30 seconds
+export const LOCK_MAX_LOCKS = 5000; // Maximum number of locks
+export const LOCK_DIR = ".ralphy/locks"; // Lock directory
+export const LOCK_CLEANUP_INTERVAL_MS = 60000; // 1 minute between cleanups
+
+// Path Constants
+export const SANDBOX_DIR = ".ralphy-worktrees";
+export const PLANNING_CACHE_FILE = "planning-cache.json";
+
+// Hash Store Constants
+export const HASH_STORE_DIR = ".ralphy-hashes";
+export const HASH_STORE_MAX_AGE_MS = 24 * 60 * 60 * 1000; // 24 hours
+export const HASH_REFERENCE_SUFFIX = ".hash-ref";
+export const ENABLE_HASH_STORE = true; // Feature flag
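The CLI defaults above are plain numbers, so call sites decide their units. A minimal sketch of how they might be consumed — `runTaskWithRetries` is hypothetical and assumes `DEFAULT_RETRY_DELAY` is in seconds, matching its role as a CLI flag default:

```ts
import { DEFAULT_MAX_RETRIES, DEFAULT_RETRY_DELAY } from "./constants.ts";

// Hypothetical helper: retry an async task using the CLI defaults above.
async function runTaskWithRetries<T>(runOnce: () => Promise<T>): Promise<T> {
  let lastError: unknown;
  for (let attempt = 1; attempt <= DEFAULT_MAX_RETRIES; attempt++) {
    try {
      return await runOnce();
    } catch (err) {
      lastError = err;
      // Assumption: DEFAULT_RETRY_DELAY is seconds, converted to ms here.
      await new Promise((r) => setTimeout(r, DEFAULT_RETRY_DELAY * 1000));
    }
  }
  throw lastError;
}
```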
diff --git a/cli/src/config/types.ts b/cli/src/config/types.ts
index 6a292aa9..22c8f2b3 100644
--- a/cli/src/config/types.ts
+++ b/cli/src/config/types.ts
@@ -17,6 +17,7 @@ export const NotificationsSchema = z.object({
   discord_webhook: z.string().default(""),
   slack_webhook: z.string().default(""),
   custom_webhook: z.string().default(""),
+  telemetry_webhook: z.string().default(""),
 });
 
 /**
@@ -109,6 +110,8 @@ export interface RuntimeOptions {
   browserEnabled: "auto" | "true" | "false";
   /** Override default model for the engine */
   modelOverride?: string;
+  /** Enable comprehensive OpenCode debugging */
+  debugOpenCode?: boolean;
   /** Skip automatic branch merging after parallel execution */
   skipMerge?: boolean;
   /** Use lightweight sandboxes instead of git worktrees for parallel execution */
@@ -142,4 +145,5 @@ export const DEFAULT_OPTIONS: RuntimeOptions = {
   githubLabel: "",
   autoCommit: true,
   browserEnabled: "auto",
+  debugOpenCode: false,
 };
diff --git a/cli/src/telemetry/types.ts b/cli/src/telemetry/types.ts
index 41650f3c..ddb4eb86 100644
--- a/cli/src/telemetry/types.ts
+++ b/cli/src/telemetry/types.ts
@@ -78,6 +78,51 @@ export interface ToolCall {
  */
 export type TelemetryLevel = "anonymous" | "full";
 
+/**
+ * Full session data for webhook
+ */
+interface WebhookSessionData {
+  sessionId: string;
+  engine: string;
+  mode: string;
+  cliVersion: string;
+  platform: string;
+  totalTokensIn: number;
+  totalTokensOut: number;
+  totalDurationMs: number;
+  taskCount: number;
+  successCount: number;
+  failedCount: number;
+  toolCalls: {
+    toolName: string;
+    callCount: number;
+    successCount: number;
+    failedCount: number;
+    avgDurationMs: number;
+  }[];
+  tags?: string[];
+}
+
+/**
+ * Full session details for webhook (full privacy mode)
+ */
+interface WebhookSessionDetails {
+  prompt?: string;
+  response?: string;
+  filePaths?: string[];
+}
+
+/**
+ * Telemetry webhook payload
+ */
+export interface TelemetryWebhookPayload {
+  event: string;
+  version: string;
+  timestamp: string;
+  session: WebhookSessionData;
+  details?: WebhookSessionDetails;
+}
+
 /**
  * Telemetry configuration
  */
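Putting the new schema field and payload type together: a sketch of what an exporter posting to the configured `telemetry_webhook` could look like. The `event` and `version` values are illustrative assumptions, not defined by this diff; `fetch` assumes Node 18+.

```ts
import type { TelemetryWebhookPayload } from "./types.ts";

// Hypothetical exporter: POST a session summary to notifications.telemetry_webhook.
async function postTelemetry(url: string, payload: TelemetryWebhookPayload): Promise<void> {
  await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
}

const payload: TelemetryWebhookPayload = {
  event: "session.completed", // assumed event name
  version: "1",               // assumed payload version
  timestamp: new Date().toISOString(),
  session: {
    sessionId: "abc123",
    engine: "opencode",
    mode: "parallel",
    cliVersion: "0.0.0",
    platform: process.platform,
    totalTokensIn: 1200,
    totalTokensOut: 800,
    totalDurationMs: 45000,
    taskCount: 3,
    successCount: 3,
    failedCount: 0,
    toolCalls: [],
  },
  // `details` would only be populated in "full" telemetry level.
};
```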
diff --git a/cli/src/ui/logger.ts b/cli/src/ui/logger.ts
index 324e719d..c9c5ea1b 100644
--- a/cli/src/ui/logger.ts
+++ b/cli/src/ui/logger.ts
@@ -1,49 +1,428 @@
+import { appendFileSync, mkdirSync } from "node:fs";
+import path, { dirname } from "node:path";
 import pc from "picocolors";
+import { sanitizeSecrets } from "../utils/sanitization.ts";
 
-let verboseMode = false;
+// Use a module-level object for state to avoid direct mutable exports
+const loggerState = {
+  verboseMode: false,
+  debugMode: false,
+};
+
+/**
+ * Validate log file path to prevent path traversal attacks
+ */
+function validateLogPath(filePath: string): string {
+  if (!filePath || typeof filePath !== "string") {
+    throw new Error("Invalid log file path");
+  }
+
+  if (filePath.includes("\0")) {
+    throw new Error("Invalid log file path: null byte detected");
+  }
+
+  return path.isAbsolute(filePath)
+    ? path.normalize(filePath)
+    : path.resolve(process.cwd(), filePath);
+}
+
+/**
+ * Get current verbose mode state
+ */
+export function isVerbose(): boolean {
+  return loggerState.verboseMode;
+}
+
+/**
+ * Get current debug mode state
+ */
+export function isDebug(): boolean {
+  return loggerState.debugMode;
+}
 
 /**
  * Set verbose mode
  */
 export function setVerbose(verbose: boolean): void {
-  verboseMode = verbose;
+  loggerState.verboseMode = verbose;
+  verboseMode = loggerState.verboseMode;
+}
+
+/**
+ * Set debug mode (implies verbose)
+ */
+export function setDebug(debug: boolean): void {
+  loggerState.debugMode = debug;
+  if (debug) {
+    loggerState.verboseMode = true;
+  }
+  verboseMode = loggerState.verboseMode;
+}
+
+// BUG FIX: Export a getter function instead of a stale primitive value
+// This ensures consumers always get the current state, not the initial value
+export function getVerboseMode(): boolean {
+  return loggerState.verboseMode;
+}
+
+// Keep backward compatibility export but mark as deprecated
+/** @deprecated Use getVerboseMode() instead for live value */
+export let verboseMode = loggerState.verboseMode;
+
+/**
+ * Log levels for structured logging
+ */
+export type LogLevel = "debug" | "info" | "success" | "warn" | "error";
+
+/**
+ * Structured log entry interface
+ */
+export interface LogEntry {
+  timestamp: string;
+  level: LogLevel;
+  component: string;
+  message: string;
+  context?: Record<string, unknown>;
+}
+
+/**
+ * Log sink interface for extensible logging
+ */
+export interface LogSink {
+  write(entry: LogEntry): void;
+}
+
+interface DisposableLogSink extends LogSink {
+  dispose(): void;
+}
+
+/**
+ * Default console log sink with colors
+ */
+class ConsoleLogSink implements LogSink {
+  write(entry: LogEntry): void {
+    // Defensive: validate entry has required fields
+    if (!entry || typeof entry !== "object") {
+      console.error("[Logger] Invalid log entry");
+      return;
+    }
+    const timestamp = entry.timestamp ?? new Date().toISOString();
+    const level = entry.level ?? "info";
+    const component = entry.component ?? "ralphy";
+    const message = entry.message ?? "";
+    const prefix = `[${timestamp}] [${level.toUpperCase()}]`;
+
+    switch (level) {
+      case "error":
+        console.error(pc.red(`${prefix} ${component ? `[${component}] ` : ""}${message}`));
+        break;
+      case "warn":
+        console.warn(pc.yellow(`${prefix} ${component ? `[${component}] ` : ""}${message}`));
+        break;
+      case "success":
+        console.log(pc.green(`${prefix} ${component ? `[${component}] ` : ""}${message}`));
+        break;
+      case "info":
+        console.log(pc.blue(`${prefix} ${component ? `[${component}] ` : ""}${message}`));
+        break;
+      case "debug":
+        console.log(pc.gray(`${prefix} ${component ? `[${component}] ` : ""}${message}`));
+        break;
+      default:
+        console.log(`${prefix} ${component ? `[${component}] ` : ""}${message}`);
+    }
+  }
+}
+
+// Global log sink instance
+let logSink: LogSink = new ConsoleLogSink();
+
+/**
+ * Set a custom log sink for extensible logging
+ */
+export function setLogSink(sink: LogSink): void {
+  if (logSink !== sink && typeof (logSink as Partial<DisposableLogSink>).dispose === "function") {
+    (logSink as DisposableLogSink).dispose();
+  }
+  logSink = sink;
+}
+
+/**
+ * Get current log sink
+ */
+export function getLogSink(): LogSink {
+  return logSink;
+}
+
+/**
+ * Internal function to create log entry
+ * Sanitizes secrets from logged data
+ */
+function createLogEntry(level: LogLevel, component: string | undefined, args: unknown[]): LogEntry {
+  const rawMessage = args
+    .map((arg) => (typeof arg === "string" ? arg : JSON.stringify(arg)))
+    .join(" ");
+  // Sanitize secrets from the message
+  const message = sanitizeSecrets(rawMessage);
+
+  return {
+    timestamp: new Date().toISOString(),
+    level,
+    component: component || "ralphy",
+    message,
+  };
+}
+
+/**
+ * Core logging function
+ */
+function log(level: LogLevel, component: string | undefined, ...args: unknown[]): void {
+  // Debug messages only show in verbose or debug mode
+  if (level === "debug" && !loggerState.verboseMode) {
+    return;
+  }
+
+  const entry = createLogEntry(level, component, args);
+  logSink.write(entry);
 }
 
 /**
  * Log info message
  */
 export function logInfo(...args: unknown[]): void {
-  console.log(pc.blue("[INFO]"), ...args);
+  log("info", undefined, ...args);
+}
+
+/**
+ * Log info message with component context
+ */
+export function logInfoContext(component: string, ...args: unknown[]): void {
+  log("info", component, ...args);
 }
 
 /**
  * Log success message
  */
 export function logSuccess(...args: unknown[]): void {
-  console.log(pc.green("[OK]"), ...args);
+  log("success", undefined, ...args);
+}
+
+/**
+ * Log success message with component context
+ */
+export function logSuccessContext(component: string, ...args: unknown[]): void {
+  log("success", component, ...args);
 }
 
 /**
  * Log warning message
  */
 export function logWarn(...args: unknown[]): void {
-  console.log(pc.yellow("[WARN]"), ...args);
+  log("warn", undefined, ...args);
+}
+
+/**
+ * Log warning message with component context
+ */
+export function logWarnContext(component: string, ...args: unknown[]): void {
+  log("warn", component, ...args);
 }
 
 /**
  * Log error message
  */
 export function logError(...args: unknown[]): void {
-  console.error(pc.red("[ERROR]"), ...args);
+  log("error", undefined, ...args);
+}
+
+/**
+ * Log error message with component context
+ */
+export function logErrorContext(component: string, ...args: unknown[]): void {
+  log("error", component, ...args);
 }
 
 /**
  * Log debug message (only in verbose mode)
  */
 export function logDebug(...args: unknown[]): void {
-  if (verboseMode) {
-    console.log(pc.dim("[DEBUG]"), ...args);
+  log("debug", undefined, ...args);
+}
+
+/**
+ * Log debug message with component context
+ */
+export function logDebugContext(component: string, ...args: unknown[]): void {
+  log("debug", component, ...args);
+}
+
+/**
+ * JSON file log sink for structured logging to file
+ */
+export class JsonFileLogSink implements LogSink {
+  private filePath: string;
+  private buffer: LogEntry[] = [];
+  private flushInterval: number;
+  private maxBufferSize: number;
+  // BUG FIX: Use proper nullable type instead of type cast hack
+  private flushTimer: NodeJS.Timeout | null = null;
+
+  constructor(filePath: string, options?: { flushIntervalMs?: number; maxBufferSize?: number }) {
+    // Validate path to prevent path traversal attacks
+    this.filePath = validateLogPath(filePath);
+    mkdirSync(dirname(this.filePath), { recursive: true });
+    this.flushInterval = options?.flushIntervalMs ?? 1000;
+    this.maxBufferSize = options?.maxBufferSize ?? 100;
+
+    // Auto-flush buffer periodically
+    this.flushTimer = setInterval(() => {
+      try {
+        this.flush();
+      } catch (err) {
+        console.error(`Failed to flush log buffer: ${err}`);
+      }
+    }, this.flushInterval);
+  }
+
+  private isFlushing = false;
+
+  write(entry: LogEntry): void {
+    this.buffer.push(entry);
+
+    if (this.buffer.length >= this.maxBufferSize) {
+      this.flush();
+    }
+  }
+
+  private flush(): void {
+    // Prevent concurrent flushes (race condition fix)
+    if (this.isFlushing || this.buffer.length === 0) return;
+
+    this.isFlushing = true;
+    let currentBuffer: LogEntry[] = [];
+
+    try {
+      // ATOMIC: Swap buffers instead of copy-then-clear
+      // This prevents race conditions where write() is called between copy and clear
+      currentBuffer = this.buffer;
+      this.buffer = []; // New empty buffer assigned atomically
+      const lines = `${currentBuffer.map((entry) => JSON.stringify(entry)).join("\n")}\n`;
+      appendFileSync(this.filePath, lines, "utf-8");
+    } catch (error) {
+      console.error(`Failed to write to log file: ${error}`);
+      // Limit buffer size to prevent memory exhaustion on persistent write failures
+      const MAX_BUFFER_SIZE = 10000;
+      const combined = [...currentBuffer, ...this.buffer];
+      // Keep only the most recent entries, discard oldest to prevent memory leak
+      if (combined.length > MAX_BUFFER_SIZE) {
+        this.buffer = combined.slice(-MAX_BUFFER_SIZE);
+        console.warn(`Log buffer truncated to ${MAX_BUFFER_SIZE} entries due to write failure`);
+      } else {
+        this.buffer = combined;
+      }
+    } finally {
+      this.isFlushing = false;
+    }
+  }
+
+  /**
+   * Dispose of the file log sink, stopping the flush timer.
+   * Call this when done logging to prevent memory leaks.
+   */
+  dispose(): void {
+    // BUG FIX: Proper nullable type check without type cast hack
+    if (this.flushTimer !== null) {
+      clearInterval(this.flushTimer);
+      this.flushTimer = null;
+    }
+    // Final flush to ensure all logs are written
+    this.flush();
+  }
+}
+
+/**
+ * Multi-sink that writes to multiple log sinks
+ */
+export class MultiLogSink implements LogSink {
+  private sinks: LogSink[];
+
+  constructor(sinks: LogSink[]) {
+    this.sinks = sinks;
+  }
+
+  write(entry: LogEntry): void {
+    for (const sink of this.sinks) {
+      try {
+        sink.write(entry);
+      } catch (error) {
+        console.error(`Log sink failed: ${error}`);
+      }
+    }
+  }
+
+  addSink(sink: LogSink): void {
+    this.sinks.push(sink);
+  }
+
+  dispose(): void {
+    for (const sink of this.sinks) {
+      if (typeof (sink as Partial<DisposableLogSink>).dispose === "function") {
+        (sink as DisposableLogSink).dispose();
+      }
+    }
+  }
+}
+
+/**
+ * Filtered log sink that only passes certain log levels
+ */
+export class FilteredLogSink implements LogSink {
+  private sink: LogSink;
+  private minLevel: LogLevel;
+  private levelPriority: Record<LogLevel, number> = {
+    debug: 0,
+    info: 1,
+    success: 2,
+    warn: 3,
+    error: 4,
+  };
+
+  constructor(sink: LogSink, minLevel: LogLevel) {
+    this.sink = sink;
+    this.minLevel = minLevel;
+  }
+
+  write(entry: LogEntry): void {
+    if (this.levelPriority[entry.level] >= this.levelPriority[this.minLevel]) {
+      this.sink.write(entry);
+    }
+  }
+
+  dispose(): void {
+    if (typeof (this.sink as Partial<DisposableLogSink>).dispose === "function") {
+      (this.sink as DisposableLogSink).dispose();
+    }
+  }
+}
+
+/**
+ * Initialize structured logging with file output
+ * @param logFilePath - Path to JSON log file (optional)
+ * @param minLevel - Minimum log level to record (default: "info")
+ */
+export function initializeStructuredLogging(
+  logFilePath?: string,
+  minLevel: LogLevel = "info",
+): void {
+  const sinks: LogSink[] = [new ConsoleLogSink()];
+
+  if (logFilePath) {
+    const fileSink = new JsonFileLogSink(logFilePath);
+    const filteredFileSink = new FilteredLogSink(fileSink, minLevel);
+    sinks.push(filteredFileSink);
   }
+
+  setLogSink(new MultiLogSink(sinks));
 }
 
 /**
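One plausible wiring of the sink API above (path and level are illustrative): console output for everything, JSON-lines file output for warnings and errors only.

```ts
import { initializeStructuredLogging, logInfoContext, logWarn, getLogSink, type LogSink } from "./logger.ts";

// Console + filtered JSON file sink; only "warn" and above reach the file.
initializeStructuredLogging(".ralphy/logs/run.jsonl", "warn");

logInfoContext("engine", "starting run"); // console only (info < warn)
logWarn("worktree is stale");             // console and file

// On shutdown, dispose the sink so the file buffer is flushed.
// MultiLogSink exposes dispose(); the optional-call guard keeps this safe
// if a custom sink without dispose() has been installed.
const sink = getLogSink() as LogSink & { dispose?: () => void };
sink.dispose?.();
```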
lowerTool === "terminal") { + parsed.running = `Run: ${shortArgs} `; + } else { + // Catch-all for other tools + parsed.tool = `${toolName}: ${shortArgs} `; + } + } + } + } + + return parsed; +} + +/** + * Format parsed step for display + */ +export function formatParsedStep(step: ParsedStep, agentNum?: number): string | null { + const prefix = agentNum !== undefined ? `Agent ${agentNum}: ` : ""; + + // Prioritize concrete actions over generic thoughts + if (step.writing) { + return `${prefix}${step.writing} `; + } + + if (step.reading) { + return `${prefix}${step.reading} `; + } + + if (step.running) { + return `${prefix}${step.running} `; + } + + if (step.thought) { + // Clean up common "Thinking:" prefixes since we wrap in {} later + const cleanedThought = step.thought.replace(/^(Thinking|Analyzing|Considering|Warning|Waiting)[:\s]*/i, "").trim(); + return `${prefix}${cleanedThought} `; + } + + if (step.executed) { + return `${prefix} Done: ${step.executed.substring(0, 100)} `; + } + + if (step.tool) { + return `${prefix} Tool: ${step.tool} `; + } + + // If raw but couldn't parse, truncate and show + if (step.raw && step.raw.length > 0) { + const trimmed = step.raw.trim(); + if (trimmed.startsWith("{") || trimmed.startsWith('"')) { + // It's JSON we couldn't parse, skip it + return null; + } + return `${prefix}${trimmed.substring(0, 100)} `; + } + + return null; +} diff --git a/cli/src/utils/cleanup.ts b/cli/src/utils/cleanup.ts new file mode 100644 index 00000000..82f1db9c --- /dev/null +++ b/cli/src/utils/cleanup.ts @@ -0,0 +1,146 @@ +import type { ChildProcess } from "node:child_process"; +import { spawnSync } from "node:child_process"; +import { logDebug, logWarn } from "../ui/logger.ts"; + +type CleanupFn = () => Promise | void; + +const cleanupRegistry: Set = new Set(); +const trackedProcesses: Set = new Set(); +let isCleaningUp = false; + +function isProcessRunning(proc: ChildProcess): boolean { + return proc.exitCode === null && proc.signalCode === null; +} + +/** + * Register a function to be called on process exit or manual cleanup + */ +export function registerCleanup(fn: CleanupFn): () => void { + cleanupRegistry.add(fn); + return () => cleanupRegistry.delete(fn); +} + +/** + * Register a child process to be tracked and killed on exit + */ +export function registerProcess(proc: ChildProcess): () => void { + trackedProcesses.add(proc); + + const remove = () => trackedProcesses.delete(proc); + + proc.on("exit", remove); + proc.on("error", remove); + + return remove; +} + +/** + * Run all registered cleanup functions and kill tracked processes + */ +export async function runCleanup(): Promise { + if (isCleaningUp) return; + isCleaningUp = true; + + // 1. 
diff --git a/cli/src/utils/cleanup.ts b/cli/src/utils/cleanup.ts
new file mode 100644
index 00000000..82f1db9c
--- /dev/null
+++ b/cli/src/utils/cleanup.ts
@@ -0,0 +1,146 @@
+import type { ChildProcess } from "node:child_process";
+import { spawnSync } from "node:child_process";
+import { logDebug, logWarn } from "../ui/logger.ts";
+
+type CleanupFn = () => Promise<void> | void;
+
+const cleanupRegistry: Set<CleanupFn> = new Set();
+const trackedProcesses: Set<ChildProcess> = new Set();
+let isCleaningUp = false;
+
+function isProcessRunning(proc: ChildProcess): boolean {
+  return proc.exitCode === null && proc.signalCode === null;
+}
+
+/**
+ * Register a function to be called on process exit or manual cleanup
+ */
+export function registerCleanup(fn: CleanupFn): () => void {
+  cleanupRegistry.add(fn);
+  return () => cleanupRegistry.delete(fn);
+}
+
+/**
+ * Register a child process to be tracked and killed on exit
+ */
+export function registerProcess(proc: ChildProcess): () => void {
+  trackedProcesses.add(proc);
+
+  const remove = () => trackedProcesses.delete(proc);
+
+  proc.on("exit", remove);
+  proc.on("error", remove);
+
+  return remove;
+}
+
+/**
+ * Run all registered cleanup functions and kill tracked processes
+ */
+export async function runCleanup(): Promise<void> {
+  if (isCleaningUp) return;
+  isCleaningUp = true;
+
+  // 1. Kill all tracked child processes with verification
+  for (const proc of trackedProcesses) {
+    try {
+      if (proc.pid && isProcessRunning(proc)) {
+        const pid = proc.pid;
+
+        if (process.platform === "win32") {
+          // Windows needs taskkill for robust child tree termination
+          const result = spawnSync("taskkill", ["/pid", String(pid), "/f", "/t"], {
+            stdio: "pipe",
+          });
+
+          // Verify the process was actually killed
+          // Status 128 = process already exited, which is fine
+          if (result.status !== 0 && result.status !== 128) {
+            logWarn(`taskkill may have failed for PID ${pid} (exit code: ${result.status})`);
+            if (result.stderr) {
+              logDebug(`taskkill stderr: ${result.stderr.toString()}`);
+            }
+          }
+
+          await new Promise((resolve) => setTimeout(resolve, 500));
+          if (isProcessRunning(proc)) {
+            logWarn(`Process ${pid} may still be running after taskkill`);
+          }
+        } else {
+          // Try graceful termination first
+          proc.kill("SIGTERM");
+
+          // Wait a bit and verify it's dead
+          await new Promise((resolve) => setTimeout(resolve, 1000));
+
+          // Check if process is still running
+          if (isProcessRunning(proc)) {
+            proc.kill("SIGKILL");
+
+            // Final verification
+            await new Promise((resolve) => setTimeout(resolve, 500));
+            if (isProcessRunning(proc)) {
+              logWarn(`Failed to terminate process ${pid} after SIGKILL`);
+            }
+          }
+        }
+      }
+    } catch (err) {
+      // Process termination failed, continue cleanup
+      logDebug(`Failed to terminate process ${proc.pid}: ${err}`);
+    }
+  }
+  trackedProcesses.clear();
+
+  // 2. Run registered cleanup functions
+  const promises: Promise<void>[] = [];
+  for (const fn of cleanupRegistry) {
+    try {
+      const result = fn();
+      if (result instanceof Promise) {
+        promises.push(result);
+      }
+    } catch (err) {
+      // Log sync errors but continue with other cleanup functions
+      promises.push(Promise.reject(err));
+    }
+  }
+
+  await Promise.allSettled(promises);
+  cleanupRegistry.clear();
+  isCleaningUp = false;
+}
+
+let isShuttingDown = false;
+
+/**
+ * Setup process signal handlers for cleanup
+ */
+export function setupSignalHandlers(): void {
+  const signals: NodeJS.Signals[] = ["SIGINT", "SIGTERM"];
+
+  for (const signal of signals) {
+    process.on(signal, async () => {
+      // Prevent duplicate cleanup runs
+      if (isShuttingDown) {
+        process.stdout.write(`\nReceived ${signal}, cleanup already in progress...\n`);
+        return;
+      }
+      isShuttingDown = true;
+
+      // Write directly to stdout to avoid event loop issues during exit
+      process.stdout.write(`\nReceived ${signal}, cleaning up processes and files...\n`);
+
+      try {
+        await runCleanup();
+        process.exit(0);
+      } catch (error) {
+        process.stderr.write(`\nCleanup failed: ${error}\n`);
+        process.exit(1);
+      }
+    });
+  }
+
+  // Note: uncaughtException is handled in cli/src/index.ts for the main process
+  // This avoids duplicate handlers and ensures consistent error handling
+}
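Typical intended usage, as a sketch: install the signal handlers once at startup, register long-lived children and teardown hooks, and deregister when work completes normally. The `sleep` child is purely illustrative (and POSIX-only).

```ts
import { spawn } from "node:child_process";
import { registerCleanup, registerProcess, setupSignalHandlers } from "./cleanup.ts";

setupSignalHandlers(); // install SIGINT/SIGTERM handlers once

const child = spawn("sleep", ["60"], { stdio: "ignore" }); // illustrative long-running child
const untrack = registerProcess(child); // killed on Ctrl+C if still running

const unregister = registerCleanup(async () => {
  // e.g. remove temp sandboxes here
});

// If the work finishes normally, deregister to skip cleanup:
untrack();
unregister();
```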
diff --git a/cli/src/utils/errors.ts b/cli/src/utils/errors.ts
new file mode 100644
index 00000000..763eab33
--- /dev/null
+++ b/cli/src/utils/errors.ts
@@ -0,0 +1,140 @@
+/**
+ * Standardized error handling utilities for consistent error types across the codebase
+ */
+
+export class RalphyError extends Error {
+  public readonly code: string;
+  public readonly context?: Record<string, unknown>;
+
+  constructor(message: string, code = "RALPHY_ERROR", context?: Record<string, unknown>) {
+    super(message);
+    this.name = "RalphyError";
+    this.code = code;
+    this.context = context;
+
+    // Maintains proper stack trace for where our error was thrown (only available on V8)
+    if (Error.captureStackTrace) {
+      Error.captureStackTrace(this, RalphyError);
+    }
+  }
+}
+
+export class ValidationError extends RalphyError {
+  constructor(message: string, context?: Record<string, unknown>) {
+    super(message, "VALIDATION_ERROR", context);
+    this.name = "ValidationError";
+  }
+}
+
+export class TimeoutError extends RalphyError {
+  constructor(message: string, context?: Record<string, unknown>) {
+    super(message, "TIMEOUT_ERROR", context);
+    this.name = "TimeoutError";
+  }
+}
+
+export class LockError extends RalphyError {
+  constructor(message: string, context?: Record<string, unknown>) {
+    super(message, "LOCK_ERROR", context);
+    this.name = "LockError";
+  }
+}
+
+export class ProcessError extends RalphyError {
+  constructor(message: string, context?: Record<string, unknown>) {
+    super(message, "PROCESS_ERROR", context);
+    this.name = "ProcessError";
+  }
+}
+
+export class SandboxError extends RalphyError {
+  constructor(message: string, context?: Record<string, unknown>) {
+    super(message, "SANDBOX_ERROR", context);
+    this.name = "SandboxError";
+  }
+}
+
+/**
+ * Convert any error to a standardized format
+ */
+export function standardizeError(error: unknown): RalphyError {
+  if (error instanceof RalphyError) {
+    return error;
+  }
+
+  if (error instanceof Error) {
+    const withMetadata = error as Error & {
+      code?: string;
+      context?: Record<string, unknown>;
+      cause?: unknown;
+    };
+
+    return new RalphyError(error.message, "UNKNOWN_ERROR", {
+      originalName: error.name,
+      originalStack: error.stack,
+      ...(withMetadata.code ? { originalCode: withMetadata.code } : {}),
+      ...(withMetadata.context ? { originalContext: withMetadata.context } : {}),
+      ...(withMetadata.cause ? { originalCause: String(withMetadata.cause) } : {}),
+    });
+  }
+
+  if (typeof error === "string") {
+    return new RalphyError(error, "STRING_ERROR");
+  }
+
+  return new RalphyError(String(error), "UNKNOWN_ERROR", { originalType: typeof error });
+}
+
+/**
+ * Check if an error is retryable
+ */
+export function isRetryableError(error: unknown): boolean {
+  const standardized = standardizeError(error);
+
+  const retryableCodes = ["TIMEOUT_ERROR", "LOCK_ERROR", "PROCESS_ERROR", "NETWORK_ERROR", "RATE_LIMIT_ERROR"];
+
+  const retryableMessages = [
+    "timeout",
+    "connection refused",
+    "network",
+    "rate limit",
+    "too many requests",
+    "temporary failure",
+    "try again",
+    "locked",
+    "conflict",
+    "connection error",
+    "unable to connect",
+    "internet connection",
+    "econnrefused",
+    "econnreset",
+    "socket hang up",
+    "fetch failed",
+  ];
+
+  const message = standardized.message.toLowerCase();
+
+  // Check error code
+  if (retryableCodes.includes(standardized.code)) {
+    return true;
+  }
+
+  // Check error message
+  return retryableMessages.some((pattern) => message.includes(pattern));
+}
+
+/**
+ * Create error with context for logging
+ */
+export function createErrorWithContext(error: unknown, context: Record<string, unknown>): RalphyError {
+  const standardized = standardizeError(error);
+
+  if (standardized.context) {
+    return new RalphyError(standardized.message, standardized.code, {
+      ...standardized.context,
+      ...context,
+    });
+  }
+
+  return new RalphyError(standardized.message, standardized.code, context);
+}
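A hedged sketch of how these helpers compose into a retry wrapper — `withRetry` itself is hypothetical, not part of this diff:

```ts
import { isRetryableError, standardizeError } from "./errors.ts";

// Hypothetical wrapper: retry only when the failure looks transient.
async function withRetry<T>(fn: () => Promise<T>, attempts = 3): Promise<T> {
  for (let i = 1; ; i++) {
    try {
      return await fn();
    } catch (err) {
      if (i >= attempts || !isRetryableError(err)) {
        // Uniform RalphyError with original name/stack preserved in context.
        throw standardizeError(err);
      }
    }
  }
}
```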
diff --git a/cli/src/utils/file-indexer.ts b/cli/src/utils/file-indexer.ts
new file mode 100644
index 00000000..9e060705
--- /dev/null
+++ b/cli/src/utils/file-indexer.ts
@@ -0,0 +1,1069 @@
+/**
+ * File Indexer Module
+ *
+ * Provides semantic chunking for large codebases and file hash caching for unchanged files.
+ * This module indexes the codebase with file metadata (path, hash, size, mtime, keywords)
+ * and provides semantic search to find relevant files based on task keywords.
+ */
+
+import { createHash } from "node:crypto";
+import { existsSync, rmSync } from "node:fs";
+import {
+  mkdir as mkdirAsync,
+  readdir as readdirAsync,
+  readFile as readFileAsync,
+  stat as statAsync,
+  writeFile as writeFileAsync,
+} from "node:fs/promises";
+import { join, relative } from "node:path";
+import { CACHE_TTL_MS, DEFAULT_IGNORE_PATTERNS, MAX_FILE_SIZE_FOR_HASH } from "../config/constants.ts";
+import { RALPHY_DIR } from "../config/loader.ts";
+import { logDebug } from "../ui/logger.ts";
+
+// Constants
+const FILE_INDEX_CACHE = "file-index.json";
+const MAX_KEYWORDS_PER_FILE = 20;
+const MAX_CONTENT_PREVIEW_LENGTH = 500;
+const RELEVANCE_THRESHOLD = 0.1;
+
+/**
+ * Maximum glob pattern length to prevent ReDoS attacks
+ */
+const MAX_GLOB_PATTERN_LENGTH = 1000;
+const GLOB_REGEX_CACHE_MAX_ENTRIES = 500;
+const GLOB_REGEX_CACHE_TTL_MS = 5 * 60 * 1000;
+
+const globRegexCache = new Map<string, { regex: RegExp; expiresAt: number }>();
+
+function isCacheFresh(index: FileIndex): boolean {
+  return Date.now() - index.timestamp <= CACHE_TTL_MS;
+}
+
+/**
+ * File metadata entry in the index
+ */
+export interface FileIndexEntry {
+  /** Relative path from workspace root */
+  path: string;
+  /** File content hash (sha256, first 16 chars) */
+  hash: string;
+  /** File size in bytes */
+  size: number;
+  /** Last modification time (ms since epoch) */
+  mtime: number;
+  /** Extracted keywords from path and content */
+  keywords: string[];
+  /** Content preview for semantic analysis */
+  preview?: string;
+  /** File extension */
+  extension: string;
+  /** Directory depth */
+  depth: number;
+}
+
+/**
+ * The complete file index for a workspace
+ */
+export interface FileIndex {
+  /** Version for cache invalidation */
+  version: number;
+  /** Timestamp of index creation */
+  timestamp: number;
+  /** Workspace root path */
+  workDir: string;
+  /** Map of relative paths to file entries */
+  files: Map<string, FileIndexEntry>;
+  /** Total files indexed */
+  totalFiles: number;
+  /** Total size of all indexed files */
+  totalSize: number;
+}
+
+/**
+ * Serialized version of FileIndex for JSON storage
+ */
+interface SerializedFileIndex {
+  version: number;
+  timestamp: number;
+  workDir: string;
+  files: Record<string, FileIndexEntry>;
+  totalFiles: number;
+  totalSize: number;
+}
+
+// In-memory cache of file indexes
+const indexCache = new Map<string, FileIndex>();
+
+// Track promises for workspaces being indexed to allow waiting
+const indexingPromises = new Map<string, Promise<FileIndex>>();
+
+/**
+ * Deep clone a FileIndex to return an immutable copy
+ * Prevents callers from modifying the shared cache
+ */
+function cloneFileIndex(index: FileIndex): FileIndex {
+  return {
+    version: index.version,
+    timestamp: index.timestamp,
+    workDir: index.workDir,
+    files: new Map(index.files),
+    totalFiles: index.totalFiles,
+    totalSize: index.totalSize,
+  };
+}
+
+/**
+ * Get the path to the file index cache
+ */
+function getIndexCachePath(workDir: string): string {
+  return join(workDir, RALPHY_DIR, FILE_INDEX_CACHE);
+}
+
+/**
+ * Check if a file should be ignored based on patterns
+ */
+function shouldIgnoreFile(filePath: string, ignorePatterns: string[]): boolean {
+  const normalizedPath = filePath.replace(/\\/g, "/");
+
+  for (const pattern of ignorePatterns) {
+    if (matchesGlob(normalizedPath, pattern)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/**
+ * Convert glob pattern to regex
+ */
+function matchesGlob(filePath: string, pattern: string): boolean {
+  // Handle ** patterns properly
+  const regexPattern = globToRegex(pattern);
+  return regexPattern.test(filePath);
+}
+
+/**
+ * Convert glob pattern to regex
+ *
+ * SECURITY NOTE: This function includes protections against ReDoS attacks:
+ * - Input length is limited to MAX_GLOB_PATTERN_LENGTH
+ * - Uses non-backtracking patterns where possible
+ */
+function globToRegex(pattern: string): RegExp {
+  const safePattern =
+    pattern.length > MAX_GLOB_PATTERN_LENGTH
+      ? pattern.slice(0, MAX_GLOB_PATTERN_LENGTH)
+      : pattern;
+
+  const now = Date.now();
+  const cached = globRegexCache.get(safePattern);
+  if (cached && cached.expiresAt > now) {
+    return cached.regex;
+  }
+
+  // Limit pattern length to prevent ReDoS attacks
+  if (safePattern.length < pattern.length) {
+    logDebug(`Glob pattern too long (${pattern.length} > ${MAX_GLOB_PATTERN_LENGTH}), truncating`);
+  }
+
+  // Escape special regex characters except * and ?
+  // Use a bounded approach to prevent catastrophic backtracking
+  let regex = safePattern
+    .replace(/[.+^${}()|[\]\\]/g, "\\$&")
+    .replace(/\*\*/g, "\0DOUBLESTAR\0") // Temporarily mark **
+    .replace(/\*/g, "[^/]*") // Single * matches anything except /
+    .replace(/\?/g, "[^/]"); // ? matches single char except /
+
+  // Handle ** (match any number of directories) using non-capturing group
+  // The (?:.*/)? pattern is bounded - it won't cause catastrophic backtracking
+  regex = regex.replace(/\0DOUBLESTAR\0/g, "(?:.*/)?");
+
+  // Handle directory separators
+  regex = regex.replace(/\//g, "[/\\\\]");
+
+  // Anchor to start
+  regex = `^${regex}`;
+
+  // Match at end if pattern doesn't end with /**
+  if (!safePattern.endsWith("/**")) {
+    regex += "$";
+  }
+
+  const compiled = new RegExp(regex, "i");
+
+  if (globRegexCache.size >= GLOB_REGEX_CACHE_MAX_ENTRIES) {
+    for (const [key, value] of globRegexCache) {
+      if (value.expiresAt <= now) {
+        globRegexCache.delete(key);
+      }
+    }
+    if (globRegexCache.size >= GLOB_REGEX_CACHE_MAX_ENTRIES) {
+      const oldestKey = globRegexCache.keys().next().value;
+      if (oldestKey) {
+        globRegexCache.delete(oldestKey);
+      }
+    }
+  }
+
+  globRegexCache.set(safePattern, { regex: compiled, expiresAt: now + GLOB_REGEX_CACHE_TTL_MS });
+  return compiled;
+}
+
+/**
+ * Extract keywords from a file path
+ */
+function extractPathKeywords(filePath: string): string[] {
+  const keywords = new Set<string>();
+
+  // Split path into components
+  const parts = filePath.split(/[/\\]/);
+
+  for (const part of parts) {
+    // Skip empty parts and common non-descriptive names
+    if (!part || part === "." || part === "..") continue;
+
+    // Extract words from camelCase, PascalCase, snake_case, kebab-case
+    const words = part
+      .replace(/\.[^.]+$/, "") // Remove extension
+      .split(/[_-]/) // Split by underscore and hyphen
+      .flatMap((word) => {
+        // Split camelCase/PascalCase
+        return word
+          .replace(/([a-z])([A-Z])/g, "$1 $2")
+          .split(/\s+/)
+          .filter((w) => w.length > 2);
+      });
+
+    for (const word of words) {
+      const lower = word.toLowerCase();
+      if (isSignificantKeyword(lower)) {
+        keywords.add(lower);
+      }
+    }
+
+    // Add the full filename (without extension) as a keyword
+    const nameWithoutExt = part.replace(/\.[^.]+$/, "").toLowerCase();
+    if (nameWithoutExt.length > 2 && !isCommonWord(nameWithoutExt)) {
+      keywords.add(nameWithoutExt);
+    }
+  }
+
+  // Add extension as keyword
+  const ext = filePath.split(".").pop()?.toLowerCase();
+  if (ext && ext !== filePath) {
+    keywords.add(ext);
+  }
+
+  return Array.from(keywords);
+}
+
+/**
+ * Extract keywords from file content
+ */
+function extractContentKeywords(content: string, maxKeywords = 10): string[] {
+  const keywords = new Set<string>();
+
+  // Extract function/class/variable names from code
+  const patterns = [
+    // Function declarations
+    /(?:function|def|fn|func)\s+(\w+)/g,
+    // Class declarations
+    /(?:class|interface|type|struct)\s+(\w+)/g,
+    // Variable declarations (const, let, var)
+    /(?:const|let|var)\s+(\w+)\s*[=:]/g,
+    // Export declarations
+    /export\s+(?:default\s+)?(?:class|function|const|let|var)?\s*(\w+)/g,
+    // Import statements - extract imported names
+    /import\s+{([^}]+)}/g,
+    // Python imports
+    /from\s+\S+\s+import\s+([^\n]+)/g,
+    // Go/Rust function signatures
+    /fn\s+(\w+)\s*\(/g,
+    // React components (PascalCase functions)
+    /const\s+([A-Z][a-zA-Z0-9]*)\s*[:=]/g,
+  ];
+
+  for (const pattern of patterns) {
+    let match: RegExpExecArray | null = null;
+    // biome-ignore lint/suspicious/noAssignInExpressions: Standard regex loop pattern
+    while ((match = pattern.exec(content)) !== null) {
+      const names = match[1]
+        .split(/[,\s]+/)
+        .map((n) => n.trim())
+        .filter((n) => n.length > 2 && isSignificantKeyword(n.toLowerCase()));
+
+      for (const name of names) {
+        keywords.add(name.toLowerCase());
+      }
+    }
+  }
+
+  // Extract common words that appear frequently
+  const words = content.toLowerCase().match(/\b[a-z]{3,}\b/g) || [];
+
+  const wordFreq = new Map<string, number>();
+  for (const word of words) {
+    if (!isCommonWord(word) && isSignificantKeyword(word)) {
+      wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
+    }
+  }
+
+  // Add most frequent words
+  const sortedWords = Array.from(wordFreq.entries())
+    .sort((a, b) => b[1] - a[1])
+    .slice(0, maxKeywords);
+
+  for (const [word] of sortedWords) {
+    keywords.add(word);
+  }
+
+  return Array.from(keywords).slice(0, maxKeywords);
+}
+
+/**
+ * Check if a word is a common/insignificant word
+ */
+function isCommonWord(word: string): boolean {
+  const commonWords = new Set([
+    "the",
+    "and",
+    "for",
+    "are",
+    "but",
+    "not",
+    "you",
+    "all",
+    "can",
+    "had",
+    "her",
+    "was",
+    "one",
+    "our",
+    "out",
+    "day",
+    "get",
+    "has",
+    "him",
+    "his",
+    "how",
+    "its",
+    "may",
+    "new",
+    "now",
+    "old",
+    "see",
+    "two",
+    "who",
+    "boy",
+    "did",
+    "she",
+    "use",
+    "way",
+    "many",
+    "oil",
+    "sit",
+    "set",
+    "run",
+    "eat",
+    "far",
+    "sea",
+    "eye",
+    "ago",
+    "off",
+    "too",
+    "any",
+    "say",
+    "man",
+    "try",
+    "ask",
+    "end",
+    "why",
+    "let",
+    "put",
+    "own",
+    "tell",
+    "very",
+    "when",
+    "come",
+    "here",
+    "just",
+    "like",
+    "long",
+    "make",
+    "over",
+    "such",
+    "take",
+    "than",
+    "them",
+    "well",
+    "were",
+    "will",
+    "with",
+    "have",
+    "from",
+    "they",
+    "know",
+    "want",
+    "been",
+    "good",
+    "much",
+    "some",
+    "time",
+    "this",
+    "that",
+    "would",
+    "there",
+    "their",
+    "what",
+    "said",
+    "each",
+    "which",
+    "about",
+    "could",
+    "other",
+    "after",
+    "first",
+    "never",
+    "these",
+    "think",
+    "where",
+    "being",
+    "every",
+    "great",
+    "might",
+    "shall",
+    "still",
+    "those",
+    "while",
+    "true",
+    "false",
+    "null",
+    "undefined",
+    "return",
+    "import",
+    "export",
+    "default",
+    "async",
+    "await",
+    "yield",
+    "throw",
+    "catch",
+    "finally",
+    "break",
+    "continue",
+    "switch",
+    "case",
+    "try",
+    "new",
+  ]);
+  return commonWords.has(word.toLowerCase());
+}
+
+/**
+ * Check if a keyword is significant (not too short, not numeric)
+ */
+function isSignificantKeyword(word: string): boolean {
+  if (word.length < 3) return false;
+  if (/^\d+$/.test(word)) return false;
+  if (/^[0-9a-f]{8,}$/i.test(word)) return false; // Likely a hash
+  return true;
+}
+
+/**
+ * Extract keywords from a task description
+ */
+export function extractTaskKeywords(taskDescription: string): string[] {
+  const keywords = new Set<string>();
+
+  // Extract file paths mentioned in the task
+  const pathMatches = taskDescription.match(/[\w\-./\\]+\.[\w]+/g) || [];
+  for (const path of pathMatches) {
+    const pathKeywords = extractPathKeywords(path);
+    for (const kw of pathKeywords) {
+      keywords.add(kw);
+    }
+  }
+
+  // Extract camelCase/PascalCase words (likely identifiers)
+  const identifierMatches = taskDescription.match(/\b[a-z]+[A-Z][a-zA-Z0-9]*\b/g) || [];
+  for (const id of identifierMatches) {
+    const words = id
+      .replace(/([a-z])([A-Z])/g, "$1 $2")
+      .split(/\s+/)
+      .filter((w) => w.length > 2);
+    for (const word of words) {
+      keywords.add(word.toLowerCase());
+    }
+  }
+
+  // Extract technical terms and concepts
+  const techTerms = taskDescription.match(/\b[A-Z][a-z]+[A-Z][a-zA-Z]+\b/g) || [];
+  for (const term of techTerms) {
+    keywords.add(term.toLowerCase());
+  }
+
+  // Extract words that look like file names or components
+  const componentMatches =
+    taskDescription.match(
+      /\b[A-Z][a-zA-Z0-9]*(?:Component|Module|Service|Handler|Controller|Model|View|Util|Helper|Manager|Store|Context|Provider|Hook)\b/g,
+    ) || [];
+  for (const comp of componentMatches) {
+    keywords.add(comp.toLowerCase());
+  }
+
+  // Extract all significant words
+  const allWords = taskDescription.toLowerCase().match(/\b[a-z]{3,}\b/g) || [];
+
+  for (const word of allWords) {
+    if (!isCommonWord(word) && isSignificantKeyword(word)) {
+      keywords.add(word);
+    }
+  }
+
+  return Array.from(keywords);
+}
+
+/**
+ * Calculate relevance score between task keywords and file entry
+ */
+function calculateRelevanceScore(taskKeywords: string[], fileEntry: FileIndexEntry): number {
+  let score = 0;
+  const fileKeywords = new Set(fileEntry.keywords);
+
+  for (const taskKw of taskKeywords) {
+    // Exact match in file keywords
+    if (fileKeywords.has(taskKw)) {
+      score += 1.0;
+      continue;
+    }
+
+    // Partial match (task keyword is substring of file keyword or vice versa)
+    for (const fileKw of fileKeywords) {
+      if (fileKw.includes(taskKw) || taskKw.includes(fileKw)) {
+        score += 0.5;
+        break;
+      }
+    }
+
+    // Check if keyword appears in path
+    if (fileEntry.path.toLowerCase().includes(taskKw)) {
+      score += 0.3;
+    }
+  }
+
+  // Normalize by number of task keywords
+  return taskKeywords.length > 0 ? score / taskKeywords.length : 0;
+}
+
+/**
+ * Create a file index entry for a single file
+ */
+async function createFileIndexEntry(
+  filePath: string,
+  relPath: string,
+  maxSizeForContent = MAX_FILE_SIZE_FOR_HASH,
+): Promise<FileIndexEntry | null> {
+  try {
+    const stat = await statAsync(filePath);
+
+    if (!stat.isFile()) return null;
+
+    // Calculate hash
+    let hash = "";
+    let preview = "";
+    let contentKeywords: string[] = [];
+
+    if (stat.size <= maxSizeForContent) {
+      try {
+        const content = await readFileAsync(filePath, "utf-8");
+        hash = createHash("sha256").update(content).digest("hex").slice(0, 16);
+        preview = content.slice(0, MAX_CONTENT_PREVIEW_LENGTH);
+        contentKeywords = extractContentKeywords(content, 10);
+      } catch {
+        // Binary or unreadable file - use mtime+size as pseudo-hash
+        hash = createHash("sha256").update(`${stat.mtimeMs}-${stat.size}`).digest("hex").slice(0, 16);
+      }
+    } else {
+      // Large file - use mtime+size as pseudo-hash
+      hash = createHash("sha256").update(`${stat.mtimeMs}-${stat.size}`).digest("hex").slice(0, 16);
+    }
+
+    // Extract path keywords
+    const pathKeywords = extractPathKeywords(relPath);
+
+    // Combine keywords
+    const allKeywords = [...new Set([...pathKeywords, ...contentKeywords])].slice(0, MAX_KEYWORDS_PER_FILE);
+
+    // Get extension
+    const ext = relPath.split(".").pop()?.toLowerCase() || "";
+
+    // Calculate depth
+    const depth = relPath.split(/[/\\]/).length - 1;
+
+    return {
+      path: relPath,
+      hash,
+      size: stat.size,
+      mtime: stat.mtimeMs,
+      keywords: allKeywords,
+      preview,
+      extension: ext,
+      depth,
+    };
+  } catch (error) {
+    logDebug(`Failed to index file ${filePath}: ${error}`);
+    return null;
+  }
+}
+
+/**
+ * Index all files in a directory recursively
+ *
+ * Thread-safe: Returns a cloned copy to prevent cache corruption.
+ * Concurrent calls for the same workspace will wait for a single indexing operation.
+ */
+export async function indexWorkspace(
+  workDir: string,
+  options: {
+    ignorePatterns?: string[];
+    forceRebuild?: boolean;
+    maxDepth?: number;
+  } = {},
+): Promise<FileIndex> {
+  const { ignorePatterns = DEFAULT_IGNORE_PATTERNS, forceRebuild = false, maxDepth = 50 } = options;
+
+  // Check memory cache first - return a clone to prevent mutation
+  const cached = indexCache.get(workDir);
+  if (!forceRebuild && cached) {
+    if (isCacheFresh(cached)) {
+      return cloneFileIndex(cached);
+    }
+    indexCache.delete(workDir);
+  }
+
+  // Check if another operation is already indexing this workspace
+  const existingPromise = indexingPromises.get(workDir);
+  if (existingPromise && !forceRebuild) {
+    logDebug(`Waiting for concurrent indexing of ${workDir}...`);
+    const result = await existingPromise;
+    // Return a clone even from the concurrent operation's result
+    return cloneFileIndex(result);
+  }
+
+  // Create the indexing promise to lock this workspace
+  const indexingPromise = performIndexing(workDir, ignorePatterns, forceRebuild, maxDepth);
+  indexingPromises.set(workDir, indexingPromise);
+
+  try {
+    const result = await indexingPromise;
+    // Return a cloned copy to prevent cache corruption
+    return cloneFileIndex(result);
+  } finally {
+    // Always clean up the promise lock
+    indexingPromises.delete(workDir);
+  }
+}
+
+/**
+ * Perform the actual indexing operation
+ */
+async function performIndexing(
+  workDir: string,
+  ignorePatterns: string[],
+  forceRebuild: boolean,
+  maxDepth: number,
+): Promise<FileIndex> {
+  // Double-check cache after acquiring lock (another thread may have completed)
+  const cached = indexCache.get(workDir);
+  if (!forceRebuild && cached) {
+    if (isCacheFresh(cached)) {
+      return cached;
+    }
+    indexCache.delete(workDir);
+  }
+
+  // Try to load from disk cache
+  if (!forceRebuild) {
+    const diskCache = await loadIndexFromDisk(workDir);
+    if (diskCache) {
+      // Perform incremental update
+      const updated = await incrementalUpdateIndex(workDir, diskCache, ignorePatterns, maxDepth);
+      indexCache.set(workDir, updated);
+      await saveIndexToDisk(workDir, updated);
+      return updated;
+    }
+  }
+
+  // Build fresh index
+  const index: FileIndex = {
+    version: 1,
+    timestamp: Date.now(),
+    workDir,
+    files: new Map(),
+    totalFiles: 0,
+    totalSize: 0,
+  };
+
+  // Ensure .ralphy directory exists
+  const ralphyDir = join(workDir, RALPHY_DIR);
+  if (!existsSync(ralphyDir)) {
+    await mkdirAsync(ralphyDir, { recursive: true });
+  }
+
+  // Collect all files
+  const filesToIndex: string[] = [];
+
+  async function collectFiles(dir: string, currentDepth: number): Promise<void> {
+    if (currentDepth > maxDepth) return;
+
+    try {
+      const entries = await readdirAsync(dir, { withFileTypes: true });
+      for (const entry of entries) {
+        const fullPath = join(dir, entry.name);
+        const relPath = relative(workDir, fullPath);
+
+        if (shouldIgnoreFile(relPath, ignorePatterns)) {
+          continue;
+        }
+
+        if (entry.isDirectory()) {
+          await collectFiles(fullPath, currentDepth + 1);
+        } else if (entry.isFile()) {
+          filesToIndex.push(fullPath);
+        }
+      }
+    } catch (error) {
+      logDebug(`Failed to read directory ${dir}: ${error}`);
+    }
+  }
+
+  await collectFiles(workDir, 0);
+
+  // Index all collected files
+  for (const filePath of filesToIndex) {
+    const relPath = relative(workDir, filePath);
+    const entry = await createFileIndexEntry(filePath, relPath);
+    if (entry) {
+      index.files.set(relPath, entry);
+      index.totalFiles++;
+      index.totalSize += entry.size;
+    }
+  }
+
+  // Cache and save
+  indexCache.set(workDir, index);
+  await saveIndexToDisk(workDir, index);
+
+  logDebug(`Indexed ${index.totalFiles} files (${(index.totalSize / 1024 / 1024).toFixed(2)} MB)`);
+
+  return index;
+}
+
+/**
+ * Perform incremental update of file index
+ */
+async function incrementalUpdateIndex(
+  workDir: string,
+  existingIndex: FileIndex,
+  ignorePatterns: string[],
+  maxDepth: number,
+): Promise<FileIndex> {
+  const updatedIndex: FileIndex = {
+    version: existingIndex.version,
+    timestamp: Date.now(),
+    workDir,
+    files: new Map(existingIndex.files),
+    totalFiles: 0,
+    totalSize: 0,
+  };
+
+  const currentFiles = new Set<string>();
+  let reindexedCount = 0;
+  let unchangedCount = 0;
+  let removedCount = 0;
+
+  async function scanDirectory(dir: string, currentDepth: number): Promise<void> {
+    if (currentDepth > maxDepth) return;
+
+    try {
+      const entries = await readdirAsync(dir, { withFileTypes: true });
+      for (const entry of entries) {
+        const fullPath = join(dir, entry.name);
+        const relPath = relative(workDir, fullPath);
+
+        if (shouldIgnoreFile(relPath, ignorePatterns)) {
+          continue;
+        }
+
+        if (entry.isDirectory()) {
+          await scanDirectory(fullPath, currentDepth + 1);
+        } else if (entry.isFile()) {
+          currentFiles.add(relPath);
+
+          const existingEntry = updatedIndex.files.get(relPath);
+          const stat = await statAsync(fullPath);
+
+          if (existingEntry && existingEntry.mtime === stat.mtimeMs && existingEntry.size === stat.size) {
+            // File unchanged - keep existing entry
+            unchangedCount++;
+          } else {
+            // File changed or new - reindex
+            const newEntry = await createFileIndexEntry(fullPath, relPath);
+            if (newEntry) {
+              updatedIndex.files.set(relPath, newEntry);
+              reindexedCount++;
+            }
+          }
+        }
+      }
+    } catch (error) {
+      logDebug(`Failed to scan directory ${dir}: ${error}`);
+    }
+  }
+
+  await scanDirectory(workDir, 0);
+
+  // Remove deleted files from index
+  for (const [relPath] of updatedIndex.files) {
+    if (!currentFiles.has(relPath)) {
+      updatedIndex.files.delete(relPath);
+      removedCount++;
+    }
+  }
+
+  // Recalculate totals
+  for (const entry of updatedIndex.files.values()) {
+    updatedIndex.totalFiles++;
+    updatedIndex.totalSize += entry.size;
+  }
+
+  logDebug(
+    `Incremental index update: ${unchangedCount} unchanged, ${reindexedCount} reindexed, ${removedCount} removed`,
+  );
+
+  return updatedIndex;
+}
+
+/**
+ * Load index from disk cache
+ */
+async function loadIndexFromDisk(workDir: string): Promise<FileIndex | null> {
+  const cachePath = getIndexCachePath(workDir);
+
+  if (!existsSync(cachePath)) {
+    return null;
+  }
+
+  try {
+    const content = await readFileAsync(cachePath, "utf-8");
+    const serialized: SerializedFileIndex = JSON.parse(content);
+
+    return {
+      version: serialized.version,
+      timestamp: serialized.timestamp,
+      workDir: serialized.workDir,
+      files: new Map(Object.entries(serialized.files)),
+      totalFiles: serialized.totalFiles,
+      totalSize: serialized.totalSize,
+    };
+  } catch (error) {
+    logDebug(`Failed to load file index from disk: ${error}`);
+    return null;
+  }
+}
+
+/**
+ * Save index to disk cache
+ */
+async function saveIndexToDisk(workDir: string, index: FileIndex): Promise<void> {
+  const cachePath = getIndexCachePath(workDir);
+
+  try {
+    const serialized: SerializedFileIndex = {
+      version: index.version,
+      timestamp: index.timestamp,
+      workDir: index.workDir,
+      files: Object.fromEntries(index.files),
+      totalFiles: index.totalFiles,
+      totalSize: index.totalSize,
+    };
+
+    await writeFileAsync(cachePath, JSON.stringify(serialized, null, 2));
+  } catch (error) {
+    logDebug(`Failed to save file index to disk: ${error}`);
+  }
+}
+
+/**
+ * Get relevant files for a task based on semantic matching
+ */
+export async function getRelevantFilesForTask(
+  workDir: string,
+  taskDescription: string,
+  options: {
+    maxFiles?: number;
+    minRelevance?: number;
+    includeExtensions?: string[];
+    excludeExtensions?: string[];
+  } = {},
+): Promise<string[]> {
+  const {
+    maxFiles = 50,
+    minRelevance = RELEVANCE_THRESHOLD,
+    includeExtensions,
+    excludeExtensions = ["log", "lock", "map", "min.js", "min.css"],
+  } = options;
+
+  // Get or build file index
+  const index = await indexWorkspace(workDir);
+
+  // Extract keywords from task
+  const taskKeywords = extractTaskKeywords(taskDescription);
+  logDebug(`Task keywords: ${taskKeywords.join(", ")}`);
+
+  if (taskKeywords.length === 0) {
+    // No keywords extracted - return most recently modified files as fallback
+    return Array.from(index.files.values())
+      .sort((a, b) => b.mtime - a.mtime)
+      .slice(0, maxFiles)
+      .map((e) => e.path);
+  }
+
+  // Score all files
+  const scoredFiles: Array<{ path: string; score: number; entry: FileIndexEntry }> = [];
+
+  for (const [path, entry] of index.files) {
+    // Filter by extension
+    if (includeExtensions && !includeExtensions.includes(entry.extension)) {
+      continue;
+    }
+    if (excludeExtensions.includes(entry.extension)) {
+      continue;
+    }
+
+    const score = calculateRelevanceScore(taskKeywords, entry);
+    if (score >= minRelevance) {
+      scoredFiles.push({ path, score, entry });
+    }
+  }
+
+  // Sort by score (descending), then by mtime (most recent first for ties)
+  scoredFiles.sort((a, b) => {
+    if (b.score !== a.score) {
+      return b.score - a.score;
+    }
+    return b.entry.mtime - a.entry.mtime;
+  });
+
+  // Take top N files
+  const relevantFiles = scoredFiles.slice(0, maxFiles).map((s) => s.path);
+
+  logDebug(`Found ${relevantFiles.length} relevant files for task (scored ${scoredFiles.length} total)`);
+
+  return relevantFiles;
+}
+
+/**
+ * Get file hash from index (useful for caching unchanged files)
+ */
+export async function getFileHashFromIndex(workDir: string, relPath: string): Promise<string | null> {
+  const index = await indexWorkspace(workDir);
+  const entry = index.files.get(relPath);
+  return entry?.hash ?? null;
+}
+
+/**
+ * Check if a file has changed based on index
+ */
+export async function hasFileChanged(workDir: string, relPath: string, expectedHash: string): Promise<boolean> {
+  const currentHash = await getFileHashFromIndex(workDir, relPath);
+  if (currentHash === null) {
+    return true; // File not in index, assume changed
+  }
+  return currentHash !== expectedHash;
+}
+
+/**
+ * Get file metadata from index
+ */
+export async function getFileMetadata(workDir: string, relPath: string): Promise<FileIndexEntry | null> {
+  const index = await indexWorkspace(workDir);
+  return index.files.get(relPath) ?? null;
+}
+
+/**
+ * Clear the file index cache (both memory and disk)
+ */
+export function clearFileIndexCache(workDir: string): void {
+  indexCache.delete(workDir);
+  const cachePath = getIndexCachePath(workDir);
+  try {
+    if (existsSync(cachePath)) {
+      rmSync(cachePath);
+    }
+  } catch (error) {
+    logDebug(`Failed to clear file index cache: ${error}`);
+  }
+}
+
+/**
+ * Get index statistics
+ */
+export async function getIndexStats(workDir: string): Promise<{
+  totalFiles: number;
+  totalSize: number;
+  avgFileSize: number;
+  lastUpdated: number;
+}> {
+  const index = await indexWorkspace(workDir);
+  return {
+    totalFiles: index.totalFiles,
+    totalSize: index.totalSize,
+    avgFileSize: index.totalFiles > 0 ? index.totalSize / index.totalFiles : 0,
+    lastUpdated: index.timestamp,
+  };
+}
+
+/**
+ * Force rebuild the file index
+ */
+export async function rebuildFileIndex(workDir: string): Promise<FileIndex> {
+  clearFileIndexCache(workDir);
+  return indexWorkspace(workDir, { forceRebuild: true });
+}
+
+/**
+ * Find files by keyword (simple search)
+ */
+export async function findFilesByKeyword(
+  workDir: string,
+  keyword: string,
+  options: { maxResults?: number } = {},
+): Promise<FileIndexEntry[]> {
+  const { maxResults = 20 } = options;
+  const index = await indexWorkspace(workDir);
+  const results: FileIndexEntry[] = [];
+  const lowerKeyword = keyword.toLowerCase();
+
+  for (const entry of index.files.values()) {
+    // Check if keyword is in path
+    if (entry.path.toLowerCase().includes(lowerKeyword)) {
+      results.push(entry);
+      continue;
+    }
+
+    // Check if keyword is in keywords
+    if (entry.keywords.some((k) => k.includes(lowerKeyword) || lowerKeyword.includes(k))) {
+      results.push(entry);
+      continue;
+    }
+
+    // Check preview for code files
+    if (entry.preview?.toLowerCase().includes(lowerKeyword)) {
+      results.push(entry);
+    }
+  }
+
+  return results.slice(0, maxResults);
+}
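End-to-end, the indexer is meant to be consumed roughly like this (task text, options, and printed output are illustrative):

```ts
import { getRelevantFilesForTask, getIndexStats } from "./file-indexer.ts";

async function pickContext(): Promise<void> {
  // Narrow a task's context to the most relevant TypeScript files.
  const files = await getRelevantFilesForTask(process.cwd(), "fix the flush race in JsonFileLogSink", {
    maxFiles: 10,
    includeExtensions: ["ts"],
  });
  console.log(files); // e.g. ["cli/src/ui/logger.ts", ...] depending on the workspace

  const stats = await getIndexStats(process.cwd());
  console.log(`${stats.totalFiles} files indexed`);
}
```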
diff --git a/cli/src/utils/json-validation.ts b/cli/src/utils/json-validation.ts
new file mode 100644
index 00000000..e5a069dc
--- /dev/null
+++ b/cli/src/utils/json-validation.ts
@@ -0,0 +1,179 @@
+import { z } from "zod";
+
+export const StepFinishSchema = z.object({
+  type: z.literal("step_finish"),
+  part: z
+    .object({
+      tokens: z
+        .object({
+          input: z.number().optional(),
+          output: z.number().optional(),
+        })
+        .optional(),
+      input: z.number().optional(),
+      output: z.number().optional(),
+      cost: z.number().optional(),
+    })
+    .optional(),
+  tokens: z
+    .object({
+      input: z.number().optional(),
+      output: z.number().optional(),
+    })
+    .optional(),
+  cost: z.number().optional(),
+  // Session ID fields (various naming conventions)
+  sessionID: z.string().optional(),
+  sessionId: z.string().optional(),
+  session_id: z.string().optional(),
+});
+
+export const StepStartSchema = z.object({
+  type: z.literal("step_start"),
+});
+
+export const TextSchema = z.object({
+  type: z.literal("text"),
+  part: z.object({
+    text: z.string(),
+  }),
+});
+
+export const ErrorSchema = z.object({
+  type: z.literal("error"),
+  error: z
+    .object({
+      message: z.string().optional(),
+    })
+    .optional(),
+  message: z.string().optional(),
+});
+
+export const ResultSchema = z.object({
+  type: z.literal("result"),
+  result: z.string().optional(),
+  usage: z
+    .object({
+      input_tokens: z.number().optional(),
+      output_tokens: z.number().optional(),
+    })
+    .optional(),
+});
+
+export const ToolUseSchema = z.object({
+  type: z.literal("tool_use"),
+  part: z
+    .object({
+      tool: z.string().optional(),
+      state: z
+        .object({
+          input: z
+            .union([z.string(), z.number(), z.boolean(), z.object({}).passthrough(), z.array(z.unknown())])
+            .optional(),
+          status: z.string().optional(),
+        })
+        .optional(),
+    })
+    .optional(),
+  tool: z.string().optional(),
+  callID: z.string().optional(),
+});
+
+export const StreamJsonEventSchema = z.union([
+  StepFinishSchema,
+  StepStartSchema,
+  TextSchema,
+  ErrorSchema,
+  ResultSchema,
+  ToolUseSchema,
+]);
+
+export type StreamJsonEvent = z.infer<typeof StreamJsonEventSchema>;
+export type StepFinish = z.infer<typeof StepFinishSchema>;
+export type TextEvent = z.infer<typeof TextSchema>;
+export type ToolUseEvent = z.infer<typeof ToolUseSchema>;
+
+/**
+ * Safely parse a JSON line with schema validation
+ * Returns null if parsing fails or schema is invalid
+ *
+ * This function handles:
+ * - Complete JSON objects
+ * - JSON followed by additional text (splits at proper object boundary)
+ * - Truncated JSON (attempts to recover)
+ * - Special characters in strings (properly handles escape sequences)
+ */
+export function parseJsonLine(line: string): { event: StreamJsonEvent; remaining?: string } | null {
+  try {
+    const trimmed = line.trim();
+    if (!trimmed) return null;
+
+    // Handle cases where JSON is followed by text without a newline
+    // Search for the end of the JSON object
+    let jsonStr = trimmed;
+    let remaining: string | undefined;
+
+    if (trimmed.startsWith("{")) {
+      let depth = 0;
+      let inString = false;
+      let isEscaped = false;
+      let jsonEndIndex = -1;
+
+      for (let i = 0; i < trimmed.length; i++) {
+        const char = trimmed[i];
+        if (isEscaped) {
+          isEscaped = false;
+          continue;
+        }
+        if (char === "\\") {
+          isEscaped = true;
+          continue;
+        }
+        if (char === '"' && !isEscaped) {
+          inString = !inString;
+          continue;
+        }
+        if (!inString) {
+          if (char === "{") depth++;
+          if (char === "}") {
+            depth--;
+            if (depth === 0) {
+              jsonEndIndex = i;
+              break;
+            }
+          }
+        }
+      }
+
+      // If we found a complete JSON object, split it from any remaining text
+      if (jsonEndIndex >= 0) {
+        jsonStr = trimmed.substring(0, jsonEndIndex + 1);
+        remaining = trimmed.substring(jsonEndIndex + 1).trim();
+      }
+      // If we didn't find a complete object but started with '{',
+      // it might be truncated - still try to parse what we have
+    }
+
+    const parsed = JSON.parse(jsonStr);
+    const event = StreamJsonEventSchema.parse(parsed);
+    return { event, remaining: remaining || undefined };
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Extract session ID from a parsed JSON event
+ */
+export function extractSessionId(event: StreamJsonEvent): string | null {
+  if ("sessionID" in event && typeof event.sessionID === "string") {
+    return event.sessionID;
+  }
+  if ("sessionId" in event && typeof event.sessionId === "string") {
+    return event.sessionId;
+  }
+  if ("session_id" in event && typeof event.session_id === "string") {
+    return event.session_id;
+  }
+  return null;
+}
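A concrete example of the boundary-splitting behavior described above (the event content is illustrative):

```ts
import { parseJsonLine, extractSessionId } from "./json-validation.ts";

// A step_finish event followed by stray text on the same line.
const result = parseJsonLine('{"type":"step_finish","sessionID":"s-1","cost":0.01} done');
if (result) {
  console.log(extractSessionId(result.event)); // "s-1"
  console.log(result.remaining);               // "done"
}
```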
diff --git a/cli/src/utils/sanitization.ts b/cli/src/utils/sanitization.ts
new file mode 100644
index 00000000..d9ada461
--- /dev/null
+++ b/cli/src/utils/sanitization.ts
@@ -0,0 +1,49 @@
+/**
+ * Sanitization utilities for removing sensitive data
+ *
+ * SECURITY: All patterns use bounded quantifiers to prevent ReDoS attacks
+ */
+
+/**
+ * Maximum input length for secret sanitization to prevent ReDoS
+ */
+const MAX_SANITIZE_INPUT_LENGTH = 1000000; // 1MB
+
+/**
+ * Sanitize sensitive data (API keys, passwords, etc.) from string input
+ *
+ * SECURITY NOTE: This function includes protections against ReDoS attacks:
+ * - Input length is limited to MAX_SANITIZE_INPUT_LENGTH
+ * - All regex patterns use bounded quantifiers (e.g., {48}, {36})
+ * - Patterns are applied sequentially over the size-bounded input
+ *
+ * @param input - The string to sanitize
+ * @returns Sanitized string with secrets redacted
+ */
+export function sanitizeSecrets(input: string): string {
+  // Limit input length to prevent ReDoS attacks
+  if (input.length > MAX_SANITIZE_INPUT_LENGTH) {
+    // For very large inputs, truncate and add warning
+    const truncated = input.slice(0, MAX_SANITIZE_INPUT_LENGTH);
+    return `${truncated}\n\n[WARNING: Content truncated due to size limits during secret sanitization]`;
+  }
+
+  // All patterns use bounded quantifiers to prevent ReDoS
+  // Patterns are designed to match specific token formats with fixed lengths
+  const patterns = [
+    { regex: /sk-[a-zA-Z0-9]{48}/g, replacement: "[API_KEY_REDACTED]" },
+    { regex: /ghp_[a-zA-Z0-9]{36}/g, replacement: "[GITHUB_TOKEN_REDACTED]" },
+    { regex: /gho_[a-zA-Z0-9]{52}/g, replacement: "[GITHUB_OAUTH_REDACTED]" },
+    { regex: /AKIA[0-9A-Z]{16}/g, replacement: "[AWS_KEY_REDACTED]" },
+    // For hex secrets, use a bounded length and require word boundaries to prevent
+    // matching large hex strings that could cause performance issues
+    { regex: /\b[0-9a-f]{64}\b/g, replacement: "[HEX_SECRET_REDACTED]" },
+  ];
+
+  let result = input;
+  for (const { regex, replacement } of patterns) {
+    result = result.replace(regex, replacement);
+  }
+
+  return result;
+}
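And a quick check of the redaction behavior — the key below is a synthetic 48-character token matching the `sk-` pattern, not a real credential:

```ts
import { sanitizeSecrets } from "./sanitization.ts";

const line = "calling api with key sk-" + "a".repeat(48);
console.log(sanitizeSecrets(line));
// -> "calling api with key [API_KEY_REDACTED]"
```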