diff --git a/ITBench b/ITBench new file mode 160000 index 00000000..29544a54 --- /dev/null +++ b/ITBench @@ -0,0 +1 @@ +Subproject commit 29544a547e77e862dbc450e0f3c1ec8b3935674a diff --git a/src/agentbox/http-server.ts b/src/agentbox/http-server.ts index 22553c16..97305540 100644 --- a/src/agentbox/http-server.ts +++ b/src/agentbox/http-server.ts @@ -321,7 +321,11 @@ export function createHttpServer(sessionManager: AgentBoxSessionManager): http.S // Subscribe to buffer events so SSE can replay them even if it connects late const brainUnsub = managed.brain.subscribe((event) => { if (!managed._promptDone) { - managed._eventBuffer.push(event); + // Stamp with server time when emitted so replayed events have accurate timestamps + const tsEvent = typeof event === "object" && event !== null + ? { ...(event as object), ts: Date.now() } + : event; + managed._eventBuffer.push(tsEvent); } // Null dpState.checklist when deep_search completes — this is the exit signal // for the SDK brain's auto-continue loop in claude-sdk-brain.ts. @@ -343,6 +347,7 @@ export function createHttpServer(sessionManager: AgentBoxSessionManager): http.S type: "tool_progress", toolName: "deep_search", progress: event, + ts: Date.now(), }); } // Sync phase events to SDK brain's dpState so the auto-continue loop @@ -551,7 +556,11 @@ export function createHttpServer(sessionManager: AgentBoxSessionManager): http.S if (closed || res.writableEnded) return; try { sseEventCount++; - const data = JSON.stringify(event); + // Add server timestamp if not already present (buffered events carry their original ts) + const out = typeof event === "object" && event !== null && !("ts" in (event as object)) + ? { ...(event as object), ts: Date.now() } + : event; + const data = JSON.stringify(out); res.write(`data: ${data}\n\n`); } catch (err) { console.warn(`[agentbox-http] SSE write error for session ${sessionId}:`, err); diff --git a/src/gateway/config.ts b/src/gateway/config.ts index 7eae3cbc..f619add4 100644 --- a/src/gateway/config.ts +++ b/src/gateway/config.ts @@ -1,3 +1,6 @@ +import fs from "node:fs"; +import path from "node:path"; + export interface ChannelConfig { enabled: boolean; [key: string]: unknown; @@ -23,5 +26,15 @@ const DEFAULT_CONFIG: GatewayConfig = { }; export function loadGatewayConfig(): GatewayConfig { + try { + // Read port from shared settings.json so one file controls everything + const configPath = process.env.SICLAW_CONFIG_DIR + ? path.resolve(process.env.SICLAW_CONFIG_DIR, "settings.json") + : path.resolve(process.cwd(), ".siclaw", "config", "settings.json"); + const raw = JSON.parse(fs.readFileSync(configPath, "utf-8")) as { server?: { port?: number } }; + if (raw?.server?.port) { + return { ...DEFAULT_CONFIG, port: raw.server.port }; + } + } catch { /* fall through to default */ } return { ...DEFAULT_CONFIG }; } diff --git a/src/gateway/web/src/hooks/usePilot.ts b/src/gateway/web/src/hooks/usePilot.ts index 56ed111d..b91e8ae1 100644 --- a/src/gateway/web/src/hooks/usePilot.ts +++ b/src/gateway/web/src/hooks/usePilot.ts @@ -34,6 +34,14 @@ export interface PilotMessage { isStreaming?: boolean; /** Hidden from chat bubbles (e.g. update_plan tool messages) */ hidden?: boolean; + /** performance.now() when tool_execution_start was received — drives live stopwatch */ + startedAt?: number; + /** Elapsed ms from tool_execution_start to tool_execution_end */ + durationMs?: number; + /** Elapsed ms for LLM thinking (message_start to message_end) — on assistant messages */ + llmDurationMs?: number; + /** ms from last anchor event (tool_execution_end or send) to message_start — TTFT approximation */ + waitMs?: number; } export interface Session { @@ -155,6 +163,17 @@ function reduceInvestigationProgress( } const SESSION_KEY_STORAGE = 'siclaw_current_session'; + +const TIMING_MAX_SAMPLES = 200; +function appendTimingSample(key: string, ms: number): void { + try { + const arr = JSON.parse(localStorage.getItem(key) ?? '[]') as number[]; + arr.push(Math.round(ms)); + if (arr.length > TIMING_MAX_SAMPLES) arr.splice(0, arr.length - TIMING_MAX_SAMPLES); + localStorage.setItem(key, JSON.stringify(arr)); + window.dispatchEvent(new CustomEvent('siclaw_timing_update')); + } catch { /* ignore storage errors */ } +} const SESSION_WORKSPACE_STORAGE = 'siclaw_session_workspace'; const SELECTED_BRAIN_STORAGE = 'siclaw_selected_brain'; @@ -300,6 +319,20 @@ export function usePilot() { // The actual brain type of the current active session (from backend), null = no session / unknown const [sessionBrainType, setSessionBrainType] = useState(null); + // Timing: performance.now() when the current prompt was sent (drives ThinkingIndicator stopwatch) + const [loadingStartedAt, setLoadingStartedAt] = useState(null); + // Ref tracks performance.now() of last message_start for LLM duration calculation + const llmStartRef = useRef(0); + // When LLM only emits tool calls (no text), llmDurationMs has nowhere to attach — + // park it here until the next tool_execution_start claims it + const pendingLlmDurationMsRef = useRef(0); + // Server-side Date.now() of last anchor event (sendMessage or tool_execution_end) — for TTFT calc + const lastServerTsRef = useRef(0); + // Parked TTFT value (message_start.ts - lastServerTsRef) awaiting attachment to a message + const pendingWaitMsRef = useRef(0); + + // Ref for sendRpc — handleWsMessage has [] deps and can't close over sendRpc directly + const sendRpcRef = useRef<((method: string, params?: Record) => Promise) | null>(null); // Ref to allow loadSessions calls from event handler without stale closures const loadSessionsRef = useRef<() => void>(() => {}); // Ref for fetching context usage from event handler @@ -415,6 +448,14 @@ export function usePilot() { setInvestigationProgress(prev => prev ?? { hypotheses: [] }); // Checklist creation now handled by dp_status event from gateway. } + // Claim pending LLM thinking duration (set when LLM only emitted tool calls, no text) + const thinkMs = pendingLlmDurationMsRef.current || undefined; + pendingLlmDurationMsRef.current = 0; + // Claim pending TTFT (set at message_start when LLM only emitted tool calls) + const waitMs = pendingWaitMsRef.current || undefined; + pendingWaitMsRef.current = 0; + if (waitMs != null) appendTimingSample('siclaw_timing_ttft', waitMs); + if (thinkMs != null) appendTimingSample('siclaw_timing_llm', thinkMs); // end_investigation cleanup now driven by dp_status "completed" event from gateway. setMessages(prev => [...prev, { id: `tool-${Date.now()}`, @@ -426,11 +467,16 @@ export function usePilot() { timestamp: new Date().toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' }), isStreaming: true, hidden, + startedAt: performance.now(), + llmDurationMs: thinkMs, + waitMs, }]); break; } case 'tool_execution_end': { + // Update anchor timestamp for next TTFT calculation + if (payload.ts) lastServerTsRef.current = payload.ts as number; const result = payload.result as { content?: Array<{ type: string; text?: string }>; details?: Record } | undefined; const resultText = result?.content ?.filter((c: { type: string }) => c.type === 'text') @@ -443,6 +489,12 @@ export function usePilot() { // Check toolName before entering setMessages to avoid side effects // inside the state updater (React StrictMode calls updaters twice). const endedToolName = payload.toolName as string | undefined; + const endPerfNow = performance.now(); + // Capture timing from the running tool BEFORE setMessages (messagesRef is last committed state) + const runningTool = [...messagesRef.current].reverse().find(m => m.role === 'tool' && m.isStreaming); + const durationMsSnap = runningTool?.startedAt != null ? Math.round(endPerfNow - runningTool.startedAt) : undefined; + const toolWaitMs = runningTool?.waitMs; + const toolLlmDurationMs = runningTool?.llmDurationMs; // When deep_search completes, mark all remaining checklist items // as done and auto-clear after 3s. This replaces the old // manage_checklist(conclusion=done) trigger. @@ -463,6 +515,9 @@ export function usePilot() { setMessages(prev => { const last = prev[prev.length - 1]; if (last?.role === 'tool' && last.isStreaming) { + const durationMs = last.startedAt != null + ? Math.round(endPerfNow - last.startedAt) + : undefined; return [ ...prev.slice(0, -1), { @@ -470,6 +525,7 @@ export function usePilot() { content: resultText, toolStatus: isError ? 'error' as const : 'success' as const, isStreaming: false, + durationMs, ...(toolDetails ? { toolDetails } : {}), ...(dbMessageId ? { id: dbMessageId } : {}), } @@ -477,6 +533,17 @@ export function usePilot() { } return prev; }); + // Collect tool execution time sample for dashboard statistics + if (durationMsSnap != null) appendTimingSample('siclaw_timing_tool', durationMsSnap); + // Persist timing + toolStatus to DB via metadata so they survive navigation and session reload + if (dbMessageId && sendRpcRef.current) { + const meta: Record = {}; + meta.toolStatus = isError ? 'error' : 'success'; + if (durationMsSnap != null) meta.durationMs = durationMsSnap; + if (toolWaitMs != null) meta.waitMs = toolWaitMs; + if (toolLlmDurationMs != null) meta.llmDurationMs = toolLlmDurationMs; + sendRpcRef.current('message.updateMeta', { id: dbMessageId, metadata: meta }).catch(() => {}); + } break; } @@ -515,6 +582,13 @@ export function usePilot() { } case 'message_start': { + llmStartRef.current = performance.now(); + // Compute TTFT: server ts of this event minus server ts of last anchor + if (payload.ts && lastServerTsRef.current > 0) { + pendingWaitMsRef.current = Math.max(0, Math.round((payload.ts as number) - lastServerTsRef.current)); + } else { + pendingWaitMsRef.current = 0; + } const msg = payload.message as { role?: string; customType?: string; details?: Record; content?: string | Array<{ type: string; text?: string }> } | undefined; // Show steer (user) messages injected mid-conversation. @@ -542,6 +616,36 @@ export function usePilot() { case 'message_end': { const endMsg = payload.message as { role?: string; toolName?: string; details?: Record } | undefined; + // Stamp LLM duration + TTFT onto the last streaming assistant message. + // If LLM only emitted tool calls (no text), no streaming assistant message exists — + // park both in pending refs so tool_execution_start can claim them. + if (endMsg?.role === 'assistant' && llmStartRef.current > 0) { + const llmDurationMs = Math.round(performance.now() - llmStartRef.current); + llmStartRef.current = 0; + const waitMs = pendingWaitMsRef.current || undefined; + pendingWaitMsRef.current = 0; + const hasStreamingAssistant = messagesRef.current.some( + m => m.role === 'assistant' && m.isStreaming + ); + if (hasStreamingAssistant) { + if (waitMs != null) appendTimingSample('siclaw_timing_ttft', waitMs); + appendTimingSample('siclaw_timing_llm', llmDurationMs); + setMessages(prev => { + for (let i = prev.length - 1; i >= 0; i--) { + if (prev[i].role === 'assistant' && prev[i].isStreaming) { + const updated = [...prev]; + updated[i] = { ...prev[i], llmDurationMs, waitMs }; + return updated; + } + } + return prev; + }); + } else { + pendingLlmDurationMsRef.current = llmDurationMs; + // waitMs stays in pendingWaitMsRef — already cleared above, restore it + pendingWaitMsRef.current = waitMs ?? 0; + } + } if (endMsg?.role === 'toolResult' && endMsg.details && Object.keys(endMsg.details).length > 0) { // Pi-agent brain: tool result details arrive via message_end (not tool_execution_end). // Backfill toolDetails onto the matching tool message. @@ -601,6 +705,7 @@ export function usePilot() { // During abort, don't unlock here — abortResponse will do it after RPC completes if (!isAbortingRef.current) { setIsLoading(false); + setLoadingStartedAt(null); } setPendingMessages([]); loadSessionsRef.current(); @@ -608,6 +713,33 @@ export function usePilot() { loadModelsRef.current(); fetchModelRef.current(); + // Persist timing for assistant messages — they have no dbMessageId during streaming, + // so we fetch the just-saved DB messages and match by role+content to update metadata. + if (!isAbortingRef.current && sendRpcRef.current && currentSessionKeyRef.current) { + const sessionId = currentSessionKeyRef.current; + const rpc = sendRpcRef.current; + const assistantsToSave = messagesRef.current.filter(m => + m.role === 'assistant' && (m.llmDurationMs != null || m.waitMs != null) + ); + if (assistantsToSave.length > 0) { + setTimeout(async () => { + try { + const res = await rpc<{ messages: PilotMessage[] }>('chat.history', { sessionId }); + const dbMessages = res.messages ?? []; + for (const m of assistantsToSave) { + const match = dbMessages.find(d => d.role === 'assistant' && d.content === m.content); + if (match) { + const meta: Record = {}; + if (m.llmDurationMs != null) meta.llmDurationMs = m.llmDurationMs; + if (m.waitMs != null) meta.waitMs = m.waitMs; + await rpc('message.updateMeta', { id: match.id, metadata: meta }); + } + } + } catch { /* best-effort */ } + }, 800); + } + } + // DP checklist completion now handled by dp_status "completed" event from gateway. // No safety-net needed — gateway emits dp_status on agent_end when status is concluding. break; @@ -688,7 +820,8 @@ export function usePilot() { } }, [isConnected, sendRpc, workspaceId]); - // Keep ref in sync + // Keep refs in sync + sendRpcRef.current = sendRpc; loadSessionsRef.current = loadSessions; loadModelsRef.current = loadModels; fetchModelRef.current = fetchCurrentModel; @@ -718,6 +851,11 @@ export function usePilot() { const mapMessages = (raw: PilotMessage[]) => raw.map(m => ({ ...m, + // Restore timing + toolStatus from metadata (persisted at runtime; absent on first load before any conversation) + toolStatus: m.toolStatus ?? (m.metadata?.toolStatus as ToolStatus | undefined) ?? (m.role === 'tool' && m.content ? 'success' as ToolStatus : undefined), + durationMs: m.durationMs ?? (m.metadata?.durationMs as number | undefined), + llmDurationMs: m.llmDurationMs ?? (m.metadata?.llmDurationMs as number | undefined), + waitMs: m.waitMs ?? (m.metadata?.waitMs as number | undefined), toolInput: m.role === 'tool' && m.toolInput ? parseToolInput(m.toolName ?? '', m.toolInput) : undefined, @@ -811,6 +949,9 @@ export function usePilot() { }; setMessages(prev => [...prev, userMsg]); setIsLoading(true); + setLoadingStartedAt(performance.now()); + lastServerTsRef.current = Date.now(); + pendingWaitMsRef.current = 0; try { const result = await sendRpc<{ sessionId: string; brainType?: BrainType }>('chat.send', { @@ -1278,5 +1419,6 @@ export function usePilot() { selectedBrain, selectBrain, sessionBrainType, + loadingStartedAt, }; } diff --git a/src/gateway/web/src/pages/Metrics/DashboardTab.tsx b/src/gateway/web/src/pages/Metrics/DashboardTab.tsx index 58c3c0dd..8288b850 100644 --- a/src/gateway/web/src/pages/Metrics/DashboardTab.tsx +++ b/src/gateway/web/src/pages/Metrics/DashboardTab.tsx @@ -6,6 +6,7 @@ import { SessionsChart } from './SessionsChart'; import { ToolCallsPanel } from './ToolCallsPanel'; import { SkillCallsPanel } from './SkillCallsPanel'; import { CumulativePanel } from './CumulativePanel'; +import { TimingStatsPanel } from './TimingStatsPanel'; interface DashboardTabProps { data: TimeseriesResponse | null; @@ -48,7 +49,10 @@ export function DashboardTab({ data, range, loading }: DashboardTabProps) { {/* Row 4: Sessions & Connections (full width) */} - {/* Row 5: Cumulative Statistics (full width) */} + {/* Row 5: Response Timing Distribution (full width) */} + + + {/* Row 6: Cumulative Statistics (full width) */} diff --git a/src/gateway/web/src/pages/Metrics/TimingStatsPanel.tsx b/src/gateway/web/src/pages/Metrics/TimingStatsPanel.tsx new file mode 100644 index 00000000..a35c68b1 --- /dev/null +++ b/src/gateway/web/src/pages/Metrics/TimingStatsPanel.tsx @@ -0,0 +1,169 @@ +import { useState, useEffect } from 'react'; +import { BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer, Cell } from 'recharts'; + +function readSamples(key: string): number[] { + try { return JSON.parse(localStorage.getItem(key) ?? '[]') as number[]; } catch { return []; } +} + +function computeStats(samples: number[]) { + if (samples.length === 0) return null; + const sorted = [...samples].sort((a, b) => a - b); + const n = sorted.length; + const pct = (p: number) => sorted[Math.min(Math.ceil(n * p) - 1, n - 1)]; + return { + min: sorted[0], + avg: Math.round(samples.reduce((s, v) => s + v, 0) / n), + p95: pct(0.95), + p99: pct(0.99), + max: sorted[n - 1], + count: n, + }; +} + +function fmt(ms: number | undefined): string { + if (ms == null) return '—'; + if (ms < 1000) return `${ms}ms`; + return `${(ms / 1000).toFixed(2)}s`; +} + +type StatKey = 'min' | 'avg' | 'p95' | 'p99' | 'max'; +const STAT_ROWS: { key: StatKey; label: string }[] = [ + { key: 'min', label: 'MIN' }, + { key: 'avg', label: 'AVG' }, + { key: 'p95', label: 'P95' }, + { key: 'p99', label: 'P99' }, + { key: 'max', label: 'MAX' }, +]; + +const METRICS = [ + { key: 'siclaw_timing_ttft', label: 'TTFT', color: '#6366f1' }, + { key: 'siclaw_timing_llm', label: 'Thinking', color: '#f59e0b' }, + { key: 'siclaw_timing_tool', label: 'Tool Exec', color: '#10b981' }, +] as const; + +export function TimingStatsPanel() { + const [tick, setTick] = useState(0); + + useEffect(() => { + const handler = () => setTick(t => t + 1); + window.addEventListener('siclaw_timing_update', handler); + return () => window.removeEventListener('siclaw_timing_update', handler); + }, []); + + const stats = METRICS.map(m => ({ + ...m, + samples: tick >= 0 ? readSamples(m.key) : [], + stat: null as ReturnType, + })).map(m => ({ ...m, stat: computeStats(m.samples) })); + + const totalSamples = Math.max(...stats.map(m => m.samples.length)); + const hasData = totalSamples > 0; + + const handleClear = () => { + METRICS.forEach(m => localStorage.removeItem(m.key)); + setTick(t => t + 1); + }; + + // Bar chart data: one bar per metric, height = avg + const barData = stats.map(m => ({ + name: m.label, + avg: m.stat?.avg ?? 0, + color: m.color, + })); + + return ( +
+
+
+

Response Timing Statistics

+

+ TTFT, thinking, and tool execution times + {hasData ? ` — last ${totalSamples} / 200 calls` : ''} +

+
+ {hasData && ( + + )} +
+ + {!hasData ? ( +
+ No data yet — start a conversation to collect timing samples +
+ ) : ( +
+ {/* Left: bar chart comparing avg of each metric */} +
+

Average comparison

+ + + + + fmt(v)} + tick={{ fontSize: 10 }} + width={56} + axisLine={false} + tickLine={false} + /> + [fmt(v as number), 'Average']} + contentStyle={{ fontSize: 12, borderRadius: 6 }} + cursor={{ fill: '#f9fafb' }} + /> + + {barData.map((entry, i) => ( + + ))} + + + +
+ + {/* Right: stats table */} +
+

Percentiles

+ + + + + {stats.map(m => ( + + ))} + + + + {STAT_ROWS.map(({ key, label }) => ( + + + {stats.map(m => ( + + ))} + + ))} + + + {stats.map(m => ( + + ))} + + +
+ {m.label} +
{label} + {fmt(m.stat?.[key])} +
n + {m.stat?.count ?? 0} +
+
+
+ )} +
+ ); +} diff --git a/src/gateway/web/src/pages/Pilot/components/PilotArea.tsx b/src/gateway/web/src/pages/Pilot/components/PilotArea.tsx index 6839d1f1..9ec33f2f 100644 --- a/src/gateway/web/src/pages/Pilot/components/PilotArea.tsx +++ b/src/gateway/web/src/pages/Pilot/components/PilotArea.tsx @@ -60,6 +60,8 @@ export interface PilotAreaProps { /** Current workspace ID for cron job operations */ selectedWorkspaceId?: string | null; isAdmin?: boolean; + /** performance.now() when the current prompt was sent — drives the top-level stopwatch */ + loadingStartedAt?: number | null; } /** Compute superseded status for schedule messages */ @@ -108,7 +110,7 @@ function computeScheduleStatuses(messages: PilotMessage[]): Map(null); const scrollContainerRef = useRef(null); const prevScrollHeightRef = useRef(0); @@ -501,7 +503,7 @@ export function PilotArea({ messages, isLoading, isLoadingHistory, wsStatus, isC )} - {isLoading && } + {isLoading && } {showFeedbackHint && (
{ const interval = setInterval(() => { @@ -551,6 +554,12 @@ function ThinkingIndicator() { return () => clearInterval(interval); }, []); + useEffect(() => { + if (!startedAt) return; + const tick = setInterval(() => setElapsed(Math.floor((performance.now() - startedAt) / 1000)), 1000); + return () => clearInterval(tick); + }, [startedAt]); + return (
@@ -564,6 +573,9 @@ function ThinkingIndicator() { )}> {THINKING_TIPS[tipIndex]} + {startedAt != null && elapsed > 0 && ( + {elapsed}s + )}
); @@ -761,6 +773,12 @@ function MessageItem({ message, scheduleStatus, onOpenSchedulePanel, onOpenSkill {message.isStreaming && !isUser && ( )} + {!message.isStreaming && !isUser && message.waitMs != null && message.waitMs > 100 && ( + ⏳{formatDuration(message.waitMs)} + )} + {!message.isStreaming && !isUser && message.llmDurationMs != null && ( + 💭{formatDuration(message.llmDurationMs)} + )}
{/* Reference chips (user messages only) */} @@ -828,12 +846,38 @@ function MessageItem({ message, scheduleStatus, onOpenSchedulePanel, onOpenSkill ); } +function ToolItemTimer({ startedAt }: { startedAt: number }) { + const [elapsed, setElapsed] = useState(0); + useEffect(() => { + const tick = setInterval(() => setElapsed(Math.floor((performance.now() - startedAt) / 1000)), 1000); + return () => clearInterval(tick); + }, [startedAt]); + return {elapsed}s; +} + +function formatDuration(ms: number): string { + if (ms < 1000) return `${ms}ms`; + return `${(ms / 1000).toFixed(1)}s`; +} + function ToolItem({ message }: { message: PilotMessage }) { const [expanded, setExpanded] = useState(false); const isOpen = message.isStreaming || expanded; return (
+ {message.waitMs != null && message.waitMs > 100 && ( +
+ + Waiting {formatDuration(message.waitMs)} +
+ )} + {message.llmDurationMs != null && ( +
+ 💭 + Thinking {formatDuration(message.llmDurationMs)} +
+ )}