Skip to content

Commit 3036e57

Browse files
patel-lyzr and claude committed
feat(agentos): Agent Simulation Engine — multi-judge evals + overview dashboard
Adds an eval/simulation framework to the AgentOS Console for grading registered agents against test suites, with results surfaced in an observability-style dashboard.

Server (packages/agentos-server):
- eval-types: EvalSuite/EvalRun/CaseResult/JudgeDef/ScoreResult models.
- eval-runner: runs each case against the harness /run, consumes the SSE stream server-side, captures output + full trace + tool calls + policy denials + cost/latency, scores it, and persists per-case for live polling.
- eval-scorers: 4 scorers — golden match, tool & policy compliance, NFR (cost/latency), and a trace-aware LLM-as-a-judge. Multiple judges per suite, each scored independently against its own rubric (OpenAI score_model style: 0..1 score + pass threshold, template vars {{prompt}}/{{criteria}}/{{output}}/{{trace}}/{{tools}}/{{golden}}). A case passes only when every enabled scorer + judge passes.
- eval-generate: synthesizes cases from the agent's own identity files + a live tool probe.
- routes/evals: suite CRUD, run trigger, run readback, case generation.
- mongo/index: eval_suites + eval_runs collections; router mounted.

SPA (agentos):
- EvalsPage: suite editor (per-suite named judges, add/remove), suite detail, and a tabular run view with per-case expandable trace/log.
- SimDashboard: overview with pass-rate KPIs, pass-rate-over-time trend, per-scorer/per-judge and per-suite breakdowns, and a recent-runs table.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 221c3c5 commit 3036e57

11 files changed

Lines changed: 2215 additions & 1 deletion

File tree

agentos/src/App.tsx

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
1-
import { Home as HomeIcon, Activity, Shield, Boxes } from "lucide-react";
1+
import { Home as HomeIcon, Activity, Shield, Boxes, FlaskConical } from "lucide-react";
22
import { NavLink, Navigate, Outlet, Route, Routes, useLocation, useNavigate } from "react-router-dom";
33
import { HomePage } from "./components/HomePage.tsx";
44
import { PoliciesPage } from "./components/PoliciesPage.tsx";
5+
import { EvalsPage } from "./components/EvalsPage.tsx";
56
import { ObservabilityTab } from "./components/observability/ObservabilityTab.tsx";
67
import { RegistryPage } from "./components/RegistryPage.tsx";
78
import { AgentDashboard } from "./components/AgentDashboard.tsx";
@@ -18,6 +19,7 @@ export default function App() {
1819
<Route path="registry" element={<RegistryRoute />} />
1920
<Route path="observability" element={<ObservabilityTab />} />
2021
<Route path="policies" element={<PoliciesPage />} />
22+
<Route path="evals" element={<EvalsPage />} />
2123
<Route path="agents/:name" element={<AgentDashboard />} />
2224
<Route path="*" element={<Navigate to="/home" replace />} />
2325
</Route>
@@ -49,6 +51,7 @@ function Layout() {
4951
<RailLink to="/registry" icon={Boxes} label="Agent Registry" active={registryActive} />
5052
<RailLink to="/observability" icon={Activity} label="Observability" />
5153
<RailLink to="/policies" icon={Shield} label="Policies" />
54+
<RailLink to="/evals" icon={FlaskConical} label="Agent Simulation Engine" />
5255
</nav>
5356

5457
<div className="flex-1" />

agentos/src/api.ts

Lines changed: 105 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -244,6 +244,95 @@ async function reqJSON<T>(method: string, path: string, body?: unknown): Promise
244244
return r.json() as Promise<T>;
245245
}
246246

247+
// ── Evals ──────────────────────────────────────────────────────────────────
248+
export interface GoldenExpectation {
249+
mode: "exact" | "contains" | "regex";
250+
value: string;
251+
}
252+
export interface EvalCase {
253+
id: string;
254+
prompt: string;
255+
criteria?: string;
256+
golden?: GoldenExpectation;
257+
expectedTools?: string[];
258+
forbiddenTools?: string[];
259+
maxCostUsd?: number;
260+
maxLatencyMs?: number;
261+
}
262+
export interface ScorerConfig {
263+
taskSuccess: boolean;
264+
toolCompliance: boolean;
265+
golden: boolean;
266+
nfr: boolean;
267+
}
268+
export interface EvalSuite {
269+
_id: string;
270+
name: string;
271+
description?: string;
272+
agentName: string;
273+
cases: EvalCase[];
274+
scorers: ScorerConfig;
275+
judges?: JudgeDef[];
276+
// legacy single-judge fields (read-only back-compat)
277+
judgeModel?: string;
278+
judgePrompt?: string;
279+
judgePassThreshold?: number;
280+
passThreshold?: number;
281+
createdAt?: string;
282+
updatedAt?: string;
283+
}
284+
export interface JudgeDef {
285+
id: string;
286+
name: string;
287+
rubric?: string;
288+
model?: string;
289+
passThreshold?: number;
290+
}
291+
export type EvalSuiteInput = Omit<EvalSuite, "_id" | "createdAt" | "updatedAt">;
292+
export interface ScoreResult {
293+
scorer: "taskSuccess" | "toolCompliance" | "golden" | "nfr";
294+
label?: string;
295+
passed: boolean;
296+
score?: number;
297+
detail?: string;
298+
}
299+
export interface PolicyDenial {
300+
tool: string;
301+
reason: string;
302+
}
303+
export interface EvalTraceEntry {
304+
type: "thinking" | "text" | "tool_use" | "tool_result";
305+
text?: string;
306+
tool?: string;
307+
input?: unknown;
308+
isError?: boolean;
309+
}
310+
export interface CaseResult {
311+
caseId: string;
312+
prompt: string;
313+
output: string;
314+
toolCalls: string[];
315+
policyDenials: PolicyDenial[];
316+
transcript: EvalTraceEntry[];
317+
costUsd: number;
318+
latencyMs: number;
319+
scores: ScoreResult[];
320+
passed: boolean;
321+
error?: string;
322+
}
323+
export interface EvalRun {
324+
_id: string;
325+
suiteId: string;
326+
suiteName: string;
327+
agentName: string;
328+
status: "running" | "completed" | "failed";
329+
startedAt: string;
330+
completedAt?: string;
331+
results: CaseResult[];
332+
summary: { total: number; passed: number; passRate: number; gatePassed?: boolean };
333+
error?: string;
334+
}
335+
247336
export const api = {
248337
agents: () => getJSON<{ agents: Agent[] }>("/agents").then((d) => d.agents),
249338
registerAgent: (input: RegisterAgentInput) =>
@@ -303,4 +392,20 @@ export const api = {
303392
reqJSON<OPAPolicyDoc | { success?: boolean }>("PUT", `/opa-policies/${encodeURIComponent(id)}`, body),
304393
deleteOpaPolicy: (id: string) =>
305394
reqJSON<{ success?: boolean }>("DELETE", `/opa-policies/${encodeURIComponent(id)}`),
395+
396+
// Evals — suite CRUD + run trigger + run readback.
397+
evals: {
398+
listSuites: () => getJSON<{ suites: EvalSuite[] }>("/evals/suites").then((d) => d.suites),
399+
getSuite: (id: string) => getJSON<EvalSuite>(`/evals/suites/${encodeURIComponent(id)}`),
400+
createSuite: (body: EvalSuiteInput) => postJSON<EvalSuite>("/evals/suites", body),
401+
updateSuite: (id: string, body: EvalSuiteInput) =>
402+
reqJSON<EvalSuite>("PUT", `/evals/suites/${encodeURIComponent(id)}`, body),
403+
deleteSuite: (id: string) => reqJSON<{ ok: boolean }>("DELETE", `/evals/suites/${encodeURIComponent(id)}`),
404+
runSuite: (id: string) => postJSON<{ runId: string }>(`/evals/suites/${encodeURIComponent(id)}/run`, {}),
405+
generateCases: (agentName: string, count: number, focus?: string) =>
406+
postJSON<{ cases: EvalCase[] }>("/evals/generate", { agentName, count, focus }).then((d) => d.cases),
407+
listRuns: (suiteId?: string) =>
408+
getJSON<{ runs: EvalRun[] }>(`/evals/runs${suiteId ? `?suite=${encodeURIComponent(suiteId)}` : ""}`).then((d) => d.runs),
409+
getRun: (id: string) => getJSON<EvalRun>(`/evals/runs/${encodeURIComponent(id)}`),
410+
},
306411
};

0 commit comments

Comments
 (0)