diff --git a/dashboard/src/__tests__/AgentEventStream.test.tsx b/dashboard/src/__tests__/AgentEventStream.test.tsx
new file mode 100644
index 00000000..b6686ba9
--- /dev/null
+++ b/dashboard/src/__tests__/AgentEventStream.test.tsx
@@ -0,0 +1,301 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import { render, screen, waitFor, act } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+
+vi.mock("@/api/client", () => ({
+  api: {
+    authHeaders: vi.fn(() => ({ "X-API-Key": "test" })),
+    setApiKey: vi.fn(),
+    isMockMode: false,
+    onMockChange: vi.fn(() => () => {}),
+  },
+}));
+
+vi.mock("@/lib/constants", () => ({
+  API_BASE_URL: "http://localhost:8080/api",
+  POLL_INTERVAL: 5000,
+}));
+
+import { AgentEventStream } from "@/components/agents/AgentEventStream";
+
+interface EnqueueController {
+  enqueue: (chunk: string) => void;
+  close: () => void;
+}
+
+function makeSseStream(): { response: Response; controller: EnqueueController } {
+  const encoder = new TextEncoder();
+  let streamController: ReadableStreamDefaultController<Uint8Array> | null = null;
+  const stream = new ReadableStream<Uint8Array>({
+    start(c) {
+      streamController = c;
+    },
+    cancel() {},
+  });
+  const controller: EnqueueController = {
+    enqueue: (chunk: string) => {
+      streamController?.enqueue(encoder.encode(chunk));
+    },
+    close: () => {
+      try {
+        streamController?.close();
+      } catch {
+        /* already closed */
+      }
+    },
+  };
+  const response = new Response(stream, {
+    status: 200,
+    headers: { "Content-Type": "text/event-stream" },
+  });
+  return { response, controller };
+}
+
+function sseLine(payload: Record<string, unknown>): string {
+  return `data: ${JSON.stringify(payload)}\n\n`;
+}
+
+describe("AgentEventStream", () => {
+  let originalFetch: typeof fetch;
+
+  beforeEach(() => {
+    originalFetch = globalThis.fetch;
+  });
+
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+    vi.restoreAllMocks();
+  });
+
+  it("shows empty state when no events have arrived", async () => {
+    const { response } = makeSseStream();
+    globalThis.fetch = vi.fn().mockResolvedValue(response);
+
+    render(<AgentEventStream runId="run-1" />);
+
+    await waitFor(() => {
+      expect(screen.getByTestId("agent-stream-empty")).toBeInTheDocument();
+    });
+  });
+
+  it("renders agent.message event with its text", async () => {
+    const { response, controller } = makeSseStream();
+    globalThis.fetch = vi.fn().mockResolvedValue(response);
+
+    render(<AgentEventStream runId="run-2" />);
+
+    await act(async () => {
+      controller.enqueue(
+        sseLine({
+          type: "agent.message",
+          text: "Hello from the agent",
+          ts: 1700000000000,
+        })
+      );
+    });
+
+    await waitFor(() => {
+      expect(screen.getByText("Hello from the agent")).toBeInTheDocument();
+    });
+    expect(screen.getByTestId("agent-event-message")).toBeInTheDocument();
+  });
+
+  it("renders agent.tool_use with tool name and arguments", async () => {
+    const { response, controller } = makeSseStream();
+    globalThis.fetch = vi.fn().mockResolvedValue(response);
+
+    render(<AgentEventStream runId="run-3" />);
+
+    await act(async () => {
+      controller.enqueue(
+        sseLine({
+          type: "agent.tool_use",
+          toolName: "web_search",
+          input: { query: "anthropic" },
+          toolUseId: "tu_1",
+          ts: 1700000001000,
+        })
+      );
+    });
+
+    await waitFor(() => {
+      expect(screen.getByTestId("agent-event-tool-use")).toBeInTheDocument();
+    });
+    const card = screen.getByTestId("agent-event-tool-use");
+    expect(card.textContent).toContain("web_search");
+    expect(card.textContent).toContain("anthropic");
+  });
+
+  it("renders agent.thinking collapsed by default", async () => {
+    const { response, controller } = makeSseStream();
+    globalThis.fetch = vi.fn().mockResolvedValue(response);
+
+    render(<AgentEventStream runId="run-4" />);
+
+    await act(async () => {
+      controller.enqueue(
+        sseLine({
+          type: "agent.thinking",
+          text: "Reasoning about next step",
+          ts: 1700000002000,
+        })
+      );
+    });
+
+    await waitFor(() => {
+      expect(screen.getByTestId("agent-event-thinking")).toBeInTheDocument();
+    });
+    const button = screen
+      .getByTestId("agent-event-thinking")
+      .querySelector("button");
+    expect(button).not.toBeNull();
+    expect(button!.getAttribute("aria-expanded")).toBe("false");
+    expect(screen.queryByText("Reasoning about next step")).not.toBeInTheDocument();
+  });
+
+  it("expands agent.thinking content on click", async () => {
+    const user = userEvent.setup();
+    const { response, controller } = makeSseStream();
+    globalThis.fetch = vi.fn().mockResolvedValue(response);
+
+    render(<AgentEventStream runId="run-5" />);
+
+    await act(async () => {
+      controller.enqueue(
+        sseLine({
+          type: "agent.thinking",
+          text: "Detailed chain of thought",
+          ts: 1700000003000,
+        })
+      );
+    });
+
+    const thinking = await screen.findByTestId("agent-event-thinking");
+    const toggle = thinking.querySelector("button");
+    expect(toggle).not.toBeNull();
+
+    await user.click(toggle!);
+
+    expect(toggle!.getAttribute("aria-expanded")).toBe("true");
+    expect(screen.getByText("Detailed chain of thought")).toBeInTheDocument();
+  });
+
+  it("applies error styling to agent.tool_result with isError=true", async () => {
+    const { response, controller } = makeSseStream();
+    globalThis.fetch = vi.fn().mockResolvedValue(response);
+
+    render(<AgentEventStream runId="run-6" />);
+
+    await act(async () => {
+      controller.enqueue(
+        sseLine({
+          type: "agent.tool_result",
+          toolUseId: "tu_2",
+          output: "Network timeout",
+          isError: true,
+          ts: 1700000004000,
+        })
+      );
+    });
+
+    const card = await screen.findByTestId("agent-event-tool-result");
+    expect(card.getAttribute("data-error")).toBe("true");
+    expect(card.className).toContain("border-error");
+    expect(card.textContent).toContain("Network timeout");
+  });
+
+  it("updates status chip from running to idle", async () => {
+    const { response, controller } = makeSseStream();
+    globalThis.fetch = vi.fn().mockResolvedValue(response);
+
+    render(<AgentEventStream runId="run-7" />);
+
+    await act(async () => {
+      controller.enqueue(
+        sseLine({ type: "session.status_running", ts: 1700000005000 })
+      );
+    });
+
+    await waitFor(() => {
+      const chip = screen.getByTestId("agent-stream-status");
+      expect(chip.getAttribute("data-tone")).toBe("running");
+    });
+
+    await act(async () => {
+      controller.enqueue(
+        sseLine({
+          type: "session.status_idle",
+          ts: 1700000006000,
+          stopReason: "end_turn",
+        })
+      );
+    });
+
+    await waitFor(() => {
+      const chip = screen.getByTestId("agent-stream-status");
+      expect(chip.getAttribute("data-tone")).toBe("idle");
+      expect(chip.textContent).toContain("end_turn");
+    });
+  });
+
+  it("groups events by threadId when multiple threads are present", async () => {
+    const { response, controller } = makeSseStream();
+    globalThis.fetch = vi.fn().mockResolvedValue(response);
+
+    render(<AgentEventStream runId="run-8" />);
+
+    await act(async () => {
+      controller.enqueue(
+        sseLine({
+          type: "agent.message",
+          threadId: "thread-a",
+          text: "from A",
+          ts: 1700000007000,
+        })
+      );
+      controller.enqueue(
+        sseLine({
+          type: "agent.message",
+          threadId: "thread-b",
+          text: "from B",
+          ts: 1700000008000,
+        })
+      );
+    });
+
+    await waitFor(() => {
+      const threads = screen.getAllByTestId("agent-stream-thread");
+      expect(threads.length).toBe(2);
+    });
+
+    const threads = screen.getAllByTestId("agent-stream-thread");
+    expect(threads[0].getAttribute("data-thread-id")).toBe("thread-a");
+    expect(threads[1].getAttribute("data-thread-id")).toBe("thread-b");
+  });
+
+  it("shows error banner when stream fails", async () => {
+    globalThis.fetch = vi.fn().mockResolvedValue(
+      new Response("boom", { status: 500 })
+    );
+
+    render(<AgentEventStream runId="run-9" />);
+
+    await waitFor(() => {
+      expect(screen.getByTestId("agent-stream-error-banner")).toBeInTheDocument();
+    });
+    const chip = screen.getByTestId("agent-stream-status");
+    expect(chip.getAttribute("data-tone")).toBe("error");
+  });
+
+  it("falls back to unavailable state on 404", async () => {
+    globalThis.fetch = vi.fn().mockResolvedValue(
+      new Response("not found", { status: 404 })
+    );
+
+    render(<AgentEventStream runId="run-10" />);
+
+    await waitFor(() => {
+      expect(screen.getByText(/not available/i)).toBeInTheDocument();
+    });
+  });
+});
diff --git a/dashboard/src/components/agents/AgentEventCard.tsx b/dashboard/src/components/agents/AgentEventCard.tsx
new file mode 100644
index 00000000..3d5d2908
--- /dev/null
+++ b/dashboard/src/components/agents/AgentEventCard.tsx
@@ -0,0 +1,176 @@
+import { useState } from "react";
+import { Brain, Wrench, MessageCircle, AlertOctagon, ChevronRight, Activity } from "lucide-react";
+import { CopyButton } from "@/components/shared/CopyButton";
+import { cn } from "@/lib/utils";
+import type { AgentEvent } from "@/types/agentEvents";
+
+interface AgentEventCardProps {
+  event: AgentEvent;
+}
+
+function formatTs(ts: number): string {
+  try {
+    return new Date(ts).toLocaleTimeString();
+  } catch {
+    return "";
+  }
+}
+
+export function AgentEventCard({ event }: AgentEventCardProps) {
+  const [thinkingOpen, setThinkingOpen] = useState(false);
+
+  switch (event.type) {
+    case "agent.message":
+      return (
+        <div
+          data-testid="agent-event-message"
+          className="rounded-lg border border-border bg-surface px-3 py-2 shadow-sm"
+        >
+          <div className="flex items-center gap-2 text-[10px] uppercase tracking-wider text-muted-foreground">
+            <MessageCircle className="h-3 w-3" />
+            <span>Message</span>
+            <span className="ml-auto font-mono text-muted">{formatTs(event.ts)}</span>
+          </div>
+          <p className="mt-1 whitespace-pre-wrap text-sm text-foreground">{event.text}</p>
+        </div>
+      );
+
+    case "agent.thinking":
+      return (
+        <div
+          data-testid="agent-event-thinking"
+          className="rounded-lg border border-border bg-background/50 shadow-sm"
+        >
+          <button
+            type="button"
+            onClick={() => setThinkingOpen((o) => !o)}
+            aria-expanded={thinkingOpen}
+            className="flex w-full items-center gap-2 px-3 py-2 text-left text-xs font-medium text-muted hover:text-foreground transition-colors"
+          >
+            <ChevronRight
+              className={cn(
+                "h-3.5 w-3.5 transition-transform duration-200",
+                thinkingOpen && "rotate-90"
+              )}
+            />
+            <Brain className="h-3.5 w-3.5 text-accent" />
+            <span>Thinking</span>
+            <span className="ml-auto font-mono text-[10px] text-muted-foreground">
+              {formatTs(event.ts)}
+            </span>
+          </button>
+          {thinkingOpen && (
+            <p className="border-t border-border px-3 py-2 text-xs italic text-muted whitespace-pre-wrap">
+              {event.text}
+            </p>
+          )}
+        </div>
+      );
+
+    case "agent.tool_use": {
+      const args = JSON.stringify(event.input);
+      const formatted = `${event.toolName}(${args})`;
+      return (
+        <div
+          data-testid="agent-event-tool-use"
+          className="rounded-lg border border-accent/30 bg-accent/5 px-3 py-2 shadow-sm"
+        >
+          <div className="flex items-center gap-2 text-[10px] uppercase tracking-wider text-accent">
+            <Wrench className="h-3 w-3" />
+            <span>Tool use</span>
+            <span className="ml-auto font-mono text-muted">{formatTs(event.ts)}</span>
+            <CopyButton value={formatted} label="tool call" />
+          </div>
+          <pre className="mt-1 overflow-x-auto rounded bg-background px-2 py-1 font-mono text-xs text-foreground">
+            <span className="text-accent">{event.toolName}</span>
+            <span className="text-muted">(</span>
+            <span>{args}</span>
+            <span className="text-muted">)</span>
+          </pre>
+        </div>
+      );
+    }
+
+    case "agent.tool_result":
+      return (
+        <div
+          data-testid="agent-event-tool-result"
+          data-error={event.isError ? "true" : "false"}
+          className={cn(
+            "rounded-lg border px-3 py-2 shadow-sm",
+            event.isError
+              ? "border-error/40 bg-error/10"
+              : "border-success/40 bg-success/10"
+          )}
+        >
+          <div
+            className={cn(
+              "flex items-center gap-2 text-[10px] uppercase tracking-wider",
+              event.isError ? "text-error" : "text-success"
+            )}
+          >
+            {event.isError ? (
+              <AlertOctagon className="h-3 w-3" />
+            ) : (
+              <Wrench className="h-3 w-3" />
+            )}
+            <span>{event.isError ? "Tool error" : "Tool result"}</span>
+            <span className="ml-auto font-mono text-muted">{formatTs(event.ts)}</span>
+          </div>
+          <pre
+            className={cn(
+              "mt-1 max-h-48 overflow-auto rounded bg-background px-2 py-1 font-mono text-xs whitespace-pre-wrap",
+              event.isError ? "text-error" : "text-foreground"
+            )}
+          >
+            {event.output}
+          </pre>
+        </div>
+      );
+
+    case "agent.thread_message_received":
+      return (
+        <div
+          data-testid="agent-event-thread-message"
+          className="flex items-start gap-2 rounded-lg border border-border bg-surface px-3 py-2 shadow-sm"
+        >
+          <span className="inline-flex shrink-0 items-center gap-1 rounded-full bg-accent/15 px-2 py-0.5 text-[10px] font-medium text-accent">
+            <Activity className="h-3 w-3" />
+            from {event.fromAgentId}
+          </span>
+          <p className="text-xs text-muted whitespace-pre-wrap">{event.preview}</p>
+          <span className="ml-auto font-mono text-[10px] text-muted">{formatTs(event.ts)}</span>
+        </div>
+      );
+
+    case "session.status_running":
+    case "session.status_idle":
+      return (
+        <div
+          data-testid="agent-event-session-status"
+          className="flex items-center gap-2 text-[11px] text-muted"
+        >
+          <span className="rounded-full border border-border bg-background px-2 py-0.5 font-medium">
+            {event.type === "session.status_running" ? "Session running" : "Session idle"}
+            {event.stopReason ? ` (${event.stopReason})` : ""}
+          </span>
+          <span className="font-mono text-[10px]">{formatTs(event.ts)}</span>
+        </div>
+      );
+
+    case "session.error":
+      return (
+        <div
+          data-testid="agent-event-session-error"
+          className="rounded-lg border border-error/40 bg-error/10 px-3 py-2 text-xs text-error"
+        >
+          <div className="flex items-center gap-2 text-[10px] uppercase tracking-wider">
+            <AlertOctagon className="h-3 w-3" />
+            <span>Session error</span>
+            <span className="ml-auto font-mono text-muted">{formatTs(event.ts)}</span>
+          </div>
+          <p className="mt-1 font-mono">{event.error}</p>
+        </div>
+      );
+  }
+}
diff --git a/dashboard/src/components/agents/AgentEventStream.tsx b/dashboard/src/components/agents/AgentEventStream.tsx
new file mode 100644
index 00000000..bea68145
--- /dev/null
+++ b/dashboard/src/components/agents/AgentEventStream.tsx
@@ -0,0 +1,200 @@
+import { useEffect, useMemo, useRef, useState } from "react";
+import { Activity, Brain, ChevronRight, AlertOctagon } from "lucide-react";
+import { useAgentEvents } from "@/hooks/useAgentEvents";
+import { AgentEventCard } from "@/components/agents/AgentEventCard";
+import { cn } from "@/lib/utils";
+import type { AgentEvent } from "@/types/agentEvents";
+
+interface AgentEventStreamProps {
+  runId: string;
+}
+
+interface ThreadGroup {
+  threadId: string | null;
+  events: AgentEvent[];
+}
+
+const NO_THREAD_KEY = "__main__";
+
+function groupByThread(events: AgentEvent[]): ThreadGroup[] {
+  const order: string[] = [];
+  const buckets = new Map<string, ThreadGroup>();
+  for (const ev of events) {
+    let key = NO_THREAD_KEY;
+    let threadId: string | null = null;
+    if ("threadId" in ev && ev.threadId) {
+      key = ev.threadId;
+      threadId = ev.threadId;
+    }
+    let bucket = buckets.get(key);
+    if (!bucket) {
+      bucket = { threadId, events: [] };
+      buckets.set(key, bucket);
+      order.push(key);
+    }
+    bucket.events.push(ev);
+  }
+  return order.map((k) => buckets.get(k) as ThreadGroup);
+}
+
+function deriveStatusLabel(
+  events: AgentEvent[],
+  status: ReturnType<typeof useAgentEvents>["status"]
+): { label: string; tone: "running" | "idle" | "error" | "unavailable" } {
+  if (status === "error") return { label: "Stream error", tone: "error" };
+  if (status === "unavailable") return { label: "Stream unavailable", tone: "unavailable" };
+  // Find most recent session status event to enrich the label.
+  let stopReason: string | undefined;
+  let lastSessionType: "session.status_running" | "session.status_idle" | null = null;
+  for (let i = events.length - 1; i >= 0; i--) {
+    const ev = events[i];
+    if (ev.type === "session.status_running" || ev.type === "session.status_idle") {
+      lastSessionType = ev.type;
+      stopReason = ev.type === "session.status_idle" ? ev.stopReason : undefined;
+      break;
+    }
+  }
+  if (status === "running" || lastSessionType === "session.status_running") {
+    return { label: "Session running", tone: "running" };
+  }
+  const suffix = stopReason ? ` (${stopReason})` : "";
+  return { label: `Session idle${suffix}`, tone: "idle" };
+}
+
+export function AgentEventStream({ runId }: AgentEventStreamProps) {
+  const { events, status, error } = useAgentEvents(runId);
+  const listRef = useRef<HTMLDivElement | null>(null);
+  const [collapsedThreads, setCollapsedThreads] = useState<Record<string, boolean>>({});
+
+  const groups = useMemo(() => groupByThread(events), [events]);
+  const multiThread = groups.filter((g) => g.threadId !== null).length > 1;
+  const statusInfo = useMemo(() => deriveStatusLabel(events, status), [events, status]);
+
+  useEffect(() => {
+    if (!listRef.current) return;
+    listRef.current.scrollTop = listRef.current.scrollHeight;
+  }, [events.length]);
+
+  if (status === "unavailable") {
+    return (
+      <div className="rounded-xl border border-border bg-surface p-5 text-sm text-muted shadow-sm">
+        <div className="mb-1 flex items-center gap-2 font-semibold text-foreground">
+          <Brain className="h-4 w-4 text-accent" />
+          Agent Reasoning
+        </div>
+        <p>Live agent stream is not available for this run.</p>
+      </div>
+    );
+  }
+
+  return (
+    <section
+      data-testid="agent-event-stream"
+      className="overflow-hidden rounded-xl border border-border bg-surface shadow-sm"
+    >
+      <header
+        className={cn(
+          "sticky top-0 z-10 flex items-center gap-2 border-b border-border bg-surface px-4 py-2.5"
+        )}
+      >
+        <Brain className="h-4 w-4 text-accent" />
+        <h3 className="text-sm font-semibold text-foreground">Agent Reasoning</h3>
+        <span
+          data-testid="agent-stream-status"
+          data-tone={statusInfo.tone}
+          className={cn(
+            "ml-auto inline-flex items-center gap-1 rounded-full px-2 py-0.5 text-[11px] font-medium",
+            statusInfo.tone === "running" &&
+              "bg-accent/15 text-accent border border-accent/30",
+            statusInfo.tone === "idle" &&
+              "bg-background text-muted border border-border",
+            statusInfo.tone === "error" &&
+              "bg-error/15 text-error border border-error/30",
+            statusInfo.tone === "unavailable" &&
+              "bg-background text-muted border border-border"
+          )}
+        >
+          <Activity className="h-3 w-3" />
+          {statusInfo.label}
+        </span>
+      </header>
+
+      {status === "error" && error && (
+        <div
+          data-testid="agent-stream-error-banner"
+          className="flex items-start gap-2 border-b border-error/30 bg-error/10 px-4 py-2 text-xs text-error"
+        >
+          <AlertOctagon className="mt-0.5 h-3.5 w-3.5 shrink-0" />
+          <span className="font-mono">{error}</span>
+        </div>
+      )}
+
+      <div
+        ref={listRef}
+        data-testid="agent-stream-list"
+        className="max-h-[28rem] overflow-y-auto px-4 py-3"
+      >
+        {events.length === 0 ? (
+          <div
+            data-testid="agent-stream-empty"
+            className="py-10 text-center text-xs text-muted"
+          >
+            No agent events yet. Reasoning, tool calls, and thread messages
+            will appear here as the session progresses.
+          </div>
+        ) : multiThread ? (
+          <div className="space-y-4">
+            {groups.map((group, idx) => {
+              const key = group.threadId ?? NO_THREAD_KEY;
+              const collapsed = !!collapsedThreads[key];
+              const label = group.threadId
+                ? `Thread ${group.threadId}`
+                : "Main session";
+              return (
+                <div
+                  key={`${key}-${idx}`}
+                  data-testid="agent-stream-thread"
+                  data-thread-id={group.threadId ?? ""}
+                  className="rounded-lg border border-border bg-background/40"
+                >
+                  <button
+                    type="button"
+                    onClick={() =>
+                      setCollapsedThreads((prev) => ({ ...prev, [key]: !prev[key] }))
+                    }
+                    aria-expanded={!collapsed}
+                    className="flex w-full items-center gap-2 px-3 py-2 text-left text-xs font-semibold text-muted hover:text-foreground transition-colors"
+                  >
+                    <ChevronRight
+                      className={cn(
+                        "h-3.5 w-3.5 transition-transform",
+                        !collapsed && "rotate-90"
+                      )}
+                    />
+                    <span>{label}</span>
+                    <span className="ml-auto rounded-full bg-surface px-1.5 py-0.5 text-[10px] font-mono text-muted-foreground">
+                      {group.events.length}
+                    </span>
+                  </button>
+                  {!collapsed && (
+                    <div className="space-y-2 border-t border-border px-3 py-2">
+                      {group.events.map((ev, i) => (
+                        <AgentEventCard key={`${key}-${i}`} event={ev} />
+                      ))}
+                    </div>
+                  )}
+                </div>
+              );
+            })}
+          </div>
+        ) : (
+          <div className="space-y-2">
+            {events.map((ev, i) => (
+              <AgentEventCard key={i} event={ev} />
+            ))}
+          </div>
+        )}
+      </div>
+    </section>
+  );
+}
diff --git a/dashboard/src/hooks/useAgentEvents.ts b/dashboard/src/hooks/useAgentEvents.ts
new file mode 100644
index 00000000..f77fcf06
--- /dev/null
+++ b/dashboard/src/hooks/useAgentEvents.ts
@@ -0,0 +1,117 @@
+import { useEffect, useRef, useState } from "react";
+import { api } from "@/api/client";
+import { API_BASE_URL } from "@/lib/constants";
+import type { AgentEvent, AgentStreamStatus } from "@/types/agentEvents";
+
+interface UseAgentEventsResult {
+  events: AgentEvent[];
+  status: AgentStreamStatus;
+  error: string | null;
+}
+
+/**
+ * Subscribes to /api/runs/{runId}/agent-stream via SSE and surfaces
+ * Anthropic agent events. Falls back gracefully to status "unavailable"
+ * when the backend endpoint is not yet wired up (e.g. 404).
+ *
+ * Uses fetch + ReadableStream so that auth headers can be passed (EventSource
+ * does not support custom headers). AbortController is used for cleanup.
+ */
+export function useAgentEvents(runId: string | null | undefined): UseAgentEventsResult {
+  const [events, setEvents] = useState<AgentEvent[]>([]);
+  const [status, setStatus] = useState<AgentStreamStatus>("idle");
+  const [error, setError] = useState<string | null>(null);
+  const abortRef = useRef<AbortController | null>(null);
+
+  useEffect(() => {
+    if (!runId) {
+      setStatus("idle");
+      setEvents([]);
+      setError(null);
+      return;
+    }
+
+    setEvents([]);
+    setError(null);
+    setStatus("connecting");
+
+    const controller = new AbortController();
+    abortRef.current = controller;
+
+    const url = `${API_BASE_URL}/runs/${runId}/agent-stream`;
+
+    (async () => {
+      try {
+        const res = await fetch(url, {
+          headers: api.authHeaders(),
+          signal: controller.signal,
+        });
+
+        if (res.status === 404) {
+          setStatus("unavailable");
+          return;
+        }
+
+        if (!res.ok || !res.body) {
+          setStatus("error");
+          setError(`Stream failed with status ${res.status}`);
+          return;
+        }
+
+        setStatus("running");
+        const reader = res.body.getReader();
+        const decoder = new TextDecoder();
+        let buffer = "";
+        let currentData = "";
+
+        while (true) {
+          const { done, value } = await reader.read();
+          if (done) break;
+
+          buffer += decoder.decode(value, { stream: true });
+          const lines = buffer.split("\n");
+          buffer = lines.pop() || "";
+
+          for (const line of lines) {
+            if (line.startsWith("data: ")) {
+              currentData += (currentData ? "\n" : "") + line.slice(6);
+            } else if (line.trim() === "" && currentData) {
+              try {
+                const parsed = JSON.parse(currentData) as AgentEvent;
+                setEvents((prev) => {
+                  const next = [...prev, parsed];
+                  if (next.length > 500) return next.slice(-500);
+                  return next;
+                });
+                if (parsed.type === "session.status_running") {
+                  setStatus("running");
+                } else if (parsed.type === "session.status_idle") {
+                  setStatus("idle");
+                } else if (parsed.type === "session.error") {
+                  setStatus("error");
+                  setError(parsed.error);
+                }
+              } catch {
+                // ignore malformed payloads
+              }
+              currentData = "";
+            }
+          }
+        }
+        // Stream ended cleanly - mirror the session status (idle if none seen).
+        setStatus((prev) => (prev === "running" ? "idle" : prev));
+      } catch (e: unknown) {
+        if (e instanceof DOMException && e.name === "AbortError") return;
+        setStatus("error");
+        setError(e instanceof Error ? e.message : "Stream connection failed");
+      }
+    })();
+
+    return () => {
+      controller.abort();
+      abortRef.current = null;
+    };
+  }, [runId]);
+
+  return { events, status, error };
+}
diff --git a/dashboard/src/pages/RunDetailPage.tsx b/dashboard/src/pages/RunDetailPage.tsx
index ad937640..7f716a97 100644
--- a/dashboard/src/pages/RunDetailPage.tsx
+++ b/dashboard/src/pages/RunDetailPage.tsx
@@ -22,6 +22,7 @@ import { CelebrationModal } from "@/components/shared/CelebrationModal";
 import { Breadcrumb } from "@/components/shared/Breadcrumb";
 import { Skeleton } from "@/components/ui/Skeleton";
 import { AiChatSidebar } from "@/components/runs/AiChatSidebar";
+import { AgentEventStream } from "@/components/agents/AgentEventStream";
 import { fireConfetti, playCelebrationSound } from "@/lib/confetti";
 import { detectAnomalies, detectRetryHeavy, type Anomaly } from "@/lib/anomalyDetection";
 import { formatDuration, formatCost, formatRelativeTime, parseUTC, cn, isSafeUrl } from "@/lib/utils";
@@ -742,6 +743,9 @@ export default function RunDetailPage() {
         />
       </div>
 
+      {/* Agent Reasoning panel - live SSE stream of Anthropic agent events */}
+      {id && <AgentEventStream runId={id} />}
+
       {/* Generated images gallery */}
       {(() => {
         const allImages = liveSteps.flatMap((s) => {
diff --git a/dashboard/src/types/agentEvents.ts b/dashboard/src/types/agentEvents.ts
new file mode 100644
index 00000000..4deb0afd
--- /dev/null
+++ b/dashboard/src/types/agentEvents.ts
@@ -0,0 +1,79 @@
+/**
+ * Anthropic managed-agent SSE event shapes surfaced to the dashboard.
+ *
+ * These mirror the events emitted by an Anthropic agent session:
+ *   - agent.message              assistant text turn
+ *   - agent.thinking             reasoning trace
+ *   - agent.tool_use             tool invocation with structured input
+ *   - agent.tool_result          tool output (success or error)
+ *   - agent.thread_message_received   multiagent thread fan-in
+ *   - session.status_running     session became active
+ *   - session.status_idle        session paused, optionally with a stop reason
+ *   - session.error              fatal session error
+ */
+export type AgentMessageEvent = {
+  type: "agent.message";
+  threadId?: string;
+  text: string;
+  ts: number;
+};
+
+export type AgentThinkingEvent = {
+  type: "agent.thinking";
+  threadId?: string;
+  text: string;
+  ts: number;
+};
+
+export type AgentToolUseEvent = {
+  type: "agent.tool_use";
+  threadId?: string;
+  toolName: string;
+  input: Record<string, unknown>;
+  toolUseId: string;
+  ts: number;
+};
+
+export type AgentToolResultEvent = {
+  type: "agent.tool_result";
+  toolUseId: string;
+  output: string;
+  isError: boolean;
+  ts: number;
+};
+
+export type AgentThreadMessageEvent = {
+  type: "agent.thread_message_received";
+  threadId: string;
+  fromAgentId: string;
+  preview: string;
+  ts: number;
+};
+
+export type SessionStatusEvent = {
+  type: "session.status_running" | "session.status_idle";
+  ts: number;
+  stopReason?: string;
+};
+
+export type SessionErrorEvent = {
+  type: "session.error";
+  error: string;
+  ts: number;
+};
+
+export type AgentEvent =
+  | AgentMessageEvent
+  | AgentThinkingEvent
+  | AgentToolUseEvent
+  | AgentToolResultEvent
+  | AgentThreadMessageEvent
+  | SessionStatusEvent
+  | SessionErrorEvent;
+
+export type AgentStreamStatus =
+  | "idle"
+  | "connecting"
+  | "running"
+  | "error"
+  | "unavailable";
diff --git a/docs/tool-examples-convention.md b/docs/tool-examples-convention.md
new file mode 100644
index 00000000..9c4481ba
--- /dev/null
+++ b/docs/tool-examples-convention.md
@@ -0,0 +1,114 @@
+# Tool Examples Convention
+
+This document is the contract for connector authors registering tools with the
+Sandcastle agent runtime. It lives next to the `tool_search` module and the
+linter that enforces it.
+
+## Why examples matter
+
+Two numbers drove this convention:
+
+- **Tool selection accuracy: 49 percent -> 74 percent** once tool search was
+  added so the agent only sees a relevant subset.
+- **Parameter-shape accuracy: 72 percent -> 90 percent** once each tool
+  carried 1 to 5 worked examples.
+
+With 62 connectors and several tools each, Sandcastle is in the regime where
+both effects compound. Skipping examples is the single fastest way to make
+your connector look broken in evals.
+
+## What every tool must declare
+
+Every `ToolDefinition` registered with `default_registry` must satisfy
+`validate_tool`:
+
+1. **`name`** - stable, snake_case, unique inside the registry.
+2. **`description`** - at least 20 characters. Lead with the verb. State what
+   the tool does, not how. Mention the most useful inputs.
+3. **`parameters`** - a JSON Schema (Draft 2020-12) for the input object.
+4. **`examples`** - between 1 and 5 entries. Each entry is a dict shaped
+   `{"input": {...}, "output": {...}}`. The `input` is validated against
+   `parameters` at registration time; the `output` is illustrative and is not
+   schema-checked but must be a dict so it serialises cleanly.
+5. **`tags`** - optional but encouraged. The search ranker scores tag hits
+   3x higher than description hits.
+6. **`defer_loading`** - default `False`. Set to `True` only for rare,
+   expensive, or domain-specialised tools that should not occupy the eager
+   prompt budget. The agent reaches lazy tools via explicit search.
+
+## When to defer loading
+
+Reach for `defer_loading=True` when **any** of the following hold:
+
+- The tool is used in fewer than 1 percent of runs across the connector.
+- The tool wraps an expensive remote operation that requires careful framing.
+- The tool is part of a specialised pack (forensics, legacy migration, niche
+  protocol) that most agents should never see.
+
+If in doubt, leave it eager. Hot tools are cheap; missed selections are not.
+
+## Example YAML connector definition
+
+```yaml
+name: pdf
+version: 1.0.0
+tools:
+  - name: pdf_extract_text
+    description: Extract plain text from a PDF, preserving reading order.
+    tags: [pdf, ocr, document]
+    parameters:
+      type: object
+      properties:
+        path:
+          type: string
+          description: Local filesystem path to the PDF.
+        pages:
+          type: string
+          description: Optional page range like "1-5" or "3,7".
+      required: [path]
+    examples:
+      - input: {path: "/tmp/contract.pdf"}
+        output: {text: "Master Services Agreement ...", pages: 12}
+      - input: {path: "/tmp/report.pdf", pages: "1-2"}
+        output: {text: "Executive Summary ...", pages: 2}
+
+  - name: pdf_redact_pii
+    description: Redact PII spans from a PDF and return a clean copy path.
+    tags: [pdf, privacy, redaction]
+    defer_loading: true
+    parameters:
+      type: object
+      properties:
+        path: {type: string}
+        modes:
+          type: array
+          items: {type: string, enum: [email, phone, ssn, name]}
+      required: [path]
+    examples:
+      - input: {path: "/tmp/case.pdf", modes: ["email", "phone"]}
+        output: {redacted_path: "/tmp/case.redacted.pdf", spans: 7}
+```
+
+## Linter command
+
+Run the linter before publishing a connector. It calls `validate_tool` on
+every entry and prints aggregated errors:
+
+```bash
+sandcastle tools validate
+```
+
+A passing run prints `OK` and exits 0. Any tool with errors fails the
+command, blocking publish.
+
+## Search and formatting at runtime
+
+The agent runtime uses three calls on `default_registry`:
+
+- `hot_tools()` for the eager system prompt.
+- `search(query, limit=5)` when the agent asks for "more tools like X".
+- `format_for_agent(tools)` to emit the Anthropic-compatible shape
+  `{name, description, input_schema, examples?}`.
+
+Authors do not need to call these directly; just register a well-formed
+`ToolDefinition` and the runtime takes care of the rest.
diff --git a/src/sandcastle/__main__.py b/src/sandcastle/__main__.py
index e9baf8c1..395da33a 100644
--- a/src/sandcastle/__main__.py
+++ b/src/sandcastle/__main__.py
@@ -1285,6 +1285,54 @@ def _cmd_publish_mcp(args: argparse.Namespace) -> None:
     )
 
 
+def _cmd_publish_skills(args: argparse.Namespace) -> None:
+    """Publish workflows as Anthropic Skills.
+
+    Without ``--upload``: dry-run, prints a JSON list of workflows that would
+    be published (one entry per .yaml/.yml file under the workflows dir).
+    With ``--upload``: invokes ``publish_workflows_as_skills(dry_run=False)``
+    so each skill is POSTed via :class:`AnthropicSkillsClient`. ``--dir``
+    overrides ``settings.workflows_dir``.
+    """
+    import asyncio
+
+    from sandcastle.config import settings
+    from sandcastle.engine.agent_skills import (
+        AnthropicSkillsClient,
+        SkillValidationError,
+        publish_workflows_as_skills,
+    )
+
+    workflow_dir = getattr(args, "dir", None) or settings.workflows_dir
+    upload = bool(getattr(args, "upload", False))
+
+    async def _run() -> list[dict]:
+        client: AnthropicSkillsClient | None = None
+        if upload:
+            api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+            if not api_key:
+                raise RuntimeError(
+                    "ANTHROPIC_API_KEY is required for --upload"
+                )
+            client = AnthropicSkillsClient(api_key=api_key)
+        return await publish_workflows_as_skills(
+            workflow_dir=workflow_dir,
+            dry_run=not upload,
+            client=client,
+        )
+
+    try:
+        results = asyncio.run(_run())
+    except SkillValidationError as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+    print(json.dumps(results, indent=2, default=str))
+
+
 def _cmd_doctor(args: argparse.Namespace) -> None:
     """Run local diagnostics - no running server needed."""
     import importlib
@@ -4415,6 +4463,23 @@ def _build_parser() -> argparse.ArgumentParser:
         help="Workflow name (omit to list all publishable workflows as JSON)",
     )
 
+    # --- publish-skills ---
+    p_publish_skills = subparsers.add_parser(
+        "publish-skills",
+        help="Publish workflows as Anthropic Skills (dry-run by default)",
+    )
+    p_publish_skills.add_argument(
+        "--upload",
+        action="store_true",
+        default=False,
+        help="Actually POST each skill via AnthropicSkillsClient (default: dry-run)",
+    )
+    p_publish_skills.add_argument(
+        "--dir",
+        default=None,
+        help="Override the workflows directory (defaults to settings.workflows_dir)",
+    )
+
     # --- doctor ---
     p_doctor = subparsers.add_parser("doctor", help="Run local diagnostics")
     p_doctor.add_argument("workflow", nargs="?", default=None, help="Workflow YAML file to diagnose (optional)")
@@ -4744,6 +4809,7 @@ def main() -> None:
         "health": _cmd_health,
         "mcp": _cmd_mcp,
         "publish-mcp": _cmd_publish_mcp,
+        "publish-skills": _cmd_publish_skills,
         "doctor": _cmd_doctor,
         "generate": _cmd_generate,
         "eval": _cmd_eval,
diff --git a/src/sandcastle/api/agent_webhooks.py b/src/sandcastle/api/agent_webhooks.py
new file mode 100644
index 00000000..8d374662
--- /dev/null
+++ b/src/sandcastle/api/agent_webhooks.py
@@ -0,0 +1,296 @@
+"""Webhooks subscriber and handler for Anthropic Managed Agent lifecycle events.
+
+This module exposes a FastAPI router that receives webhook callbacks from
+Anthropic's Managed Agents API (beta header `managed-agents-2026-04-01`) and
+dispatches them to registered async handlers. It also provides an
+`AnthropicWebhookSubscription` client for managing subscriptions remotely.
+
+Event types handled:
+  - session.status_idle
+  - session.status_running
+  - session.status_rescheduled
+  - session.status_terminated
+  - session.error
+  - vault.credential_refreshed
+
+The router ACKs within Anthropic's webhook timeout by firing handlers as
+asyncio tasks rather than awaiting them inline.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import hmac
+import json
+import logging
+import os
+from collections.abc import Awaitable, Callable
+from typing import Any
+
+import httpx
+from fastapi import APIRouter, Header, HTTPException, Request, status
+
+from sandcastle.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+SUPPORTED_EVENTS: tuple[str, ...] = (
+    "session.status_idle",
+    "session.status_running",
+    "session.status_rescheduled",
+    "session.status_terminated",
+    "session.error",
+    "vault.credential_refreshed",
+)
+
+# Anthropic Managed Agents beta header (May 6 2026 webhooks beta).
+ANTHROPIC_BETA_HEADER = "managed-agents-2026-04-01"
+
+# Endpoint path for webhook subscription management. The public path was not
+# finalised at module-authoring time; we default to /v1/webhooks and allow
+# callers to override via the `endpoint` constructor argument.
+DEFAULT_SUBSCRIPTIONS_ENDPOINT = "/v1/webhooks"
+
+
+HandlerFn = Callable[[dict[str, Any]], Awaitable[None]]
+
+# Registry of handlers keyed by event type. External modules append via
+# `register_handler` (decorator or direct call).
+AGENT_WEBHOOK_HANDLERS: dict[str, list[HandlerFn]] = {
+    event: [] for event in SUPPORTED_EVENTS
+}
+
+
+class WebhookVerifyError(Exception):
+    """Raised when a webhook HMAC signature cannot be verified."""
+
+
+def register_handler(
+    event_type: str, handler: HandlerFn | None = None
+) -> HandlerFn | Callable[[HandlerFn], HandlerFn]:
+    """Register an async handler for a webhook event type.
+
+    Usable both as decorator and as direct call:
+
+        @register_handler("session.status_idle")
+        async def my_handler(event): ...
+
+        register_handler("session.error", my_handler)
+    """
+
+    if event_type not in AGENT_WEBHOOK_HANDLERS:
+        AGENT_WEBHOOK_HANDLERS[event_type] = []
+
+    def _add(fn: HandlerFn) -> HandlerFn:
+        AGENT_WEBHOOK_HANDLERS[event_type].append(fn)
+        return fn
+
+    if handler is not None:
+        return _add(handler)
+    return _add
+
+
+def verify_signature(secret: str, raw_body: bytes, signature_header: str) -> bool:
+    """Verify an HMAC-SHA256 signature header against the raw request body.
+
+    Accepts the digest either as a bare hex string or prefixed with `sha256=`.
+    """
+
+    if not signature_header:
+        return False
+
+    provided = signature_header.strip()
+    if provided.startswith("sha256="):
+        provided = provided[len("sha256=") :]
+
+    expected = hmac.new(
+        secret.encode("utf-8"), raw_body, hashlib.sha256
+    ).hexdigest()
+    return hmac.compare_digest(expected, provided)
+
+
+router = APIRouter(prefix="/agent-webhooks", tags=["agent-webhooks"])
+
+
+async def _dispatch(event_type: str, event: dict[str, Any]) -> None:
+    """Run all registered handlers for `event_type` concurrently."""
+
+    handlers = AGENT_WEBHOOK_HANDLERS.get(event_type, [])
+    if not handlers:
+        logger.debug("No handlers registered for event %s", event_type)
+        return
+
+    async def _safe_run(fn: HandlerFn) -> None:
+        try:
+            await fn(event)
+        except Exception:  # noqa: BLE001 - we never want a handler to crash dispatch
+            logger.exception("Webhook handler %s failed for %s", fn, event_type)
+
+    await asyncio.gather(*(_safe_run(fn) for fn in handlers))
+
+
+@router.post("/anthropic")
+async def receive_anthropic_webhook(
+    request: Request,
+    x_anthropic_signature: str | None = Header(default=None),
+) -> dict[str, Any]:
+    """Receive a webhook from Anthropic Managed Agents.
+
+    Verifies HMAC-SHA256 signature when `ANTHROPIC_WEBHOOK_SECRET` is set, or
+    when running in non-local mode (in which case absence of the secret is a
+    misconfiguration and yields 401). Dispatches in the background and returns
+    immediately so Anthropic's ACK timeout is not exceeded.
+    """
+
+    raw_body = await request.body()
+    secret = os.getenv("ANTHROPIC_WEBHOOK_SECRET")
+
+    if secret:
+        if not verify_signature(secret, raw_body, x_anthropic_signature or ""):
+            logger.warning("Rejected Anthropic webhook with bad signature")
+            raise HTTPException(
+                status_code=status.HTTP_401_UNAUTHORIZED,
+                detail="invalid signature",
+            )
+    else:
+        if settings.is_local_mode:
+            logger.warning(
+                "ANTHROPIC_WEBHOOK_SECRET not set; skipping signature verify "
+                "(local mode only)"
+            )
+        else:
+            logger.error(
+                "ANTHROPIC_WEBHOOK_SECRET not set in non-local mode; rejecting"
+            )
+            raise HTTPException(
+                status_code=status.HTTP_401_UNAUTHORIZED,
+                detail="webhook secret not configured",
+            )
+
+    try:
+        payload = json.loads(raw_body.decode("utf-8") or "{}")
+    except json.JSONDecodeError as exc:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"invalid json: {exc}",
+        ) from exc
+
+    event_type = payload.get("type") or payload.get("event_type")
+    if not event_type:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="missing event type",
+        )
+
+    if event_type not in SUPPORTED_EVENTS:
+        logger.info("Ignoring unsupported Anthropic webhook event %s", event_type)
+        return {"status": "ignored", "event_type": event_type}
+
+    # Fire-and-forget dispatch so we ACK Anthropic within their timeout.
+    asyncio.create_task(_dispatch(event_type, payload))
+
+    return {"status": "accepted", "event_type": event_type}
+
+
+class AnthropicWebhookSubscription:
+    """Client for managing Anthropic Managed Agents webhook subscriptions.
+
+    The public endpoint for webhook subscription management was not finalised
+    at module-authoring time. We default to `/v1/webhooks`; pass `endpoint=`
+    to override once Anthropic publishes the final path.
+    """
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        base_url: str = "https://api.anthropic.com",
+        endpoint: str = DEFAULT_SUBSCRIPTIONS_ENDPOINT,
+        client: httpx.AsyncClient | None = None,
+        timeout: float = 30.0,
+    ) -> None:
+        self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY", "")
+        self.base_url = base_url.rstrip("/")
+        self.endpoint = endpoint if endpoint.startswith("/") else f"/{endpoint}"
+        self._client = client
+        self._timeout = timeout
+
+    def _headers(self) -> dict[str, str]:
+        return {
+            "x-api-key": self.api_key,
+            "anthropic-version": "2023-06-01",
+            "anthropic-beta": ANTHROPIC_BETA_HEADER,
+            "content-type": "application/json",
+        }
+
+    async def _get_client(self) -> httpx.AsyncClient:
+        if self._client is not None:
+            return self._client
+        return httpx.AsyncClient(base_url=self.base_url, timeout=self._timeout)
+
+    async def create_subscription(
+        self,
+        callback_url: str,
+        events: list[str],
+        secret: str | None = None,
+    ) -> dict[str, Any]:
+        """Create a new webhook subscription. Returns the subscription record."""
+
+        body: dict[str, Any] = {"url": callback_url, "events": events}
+        if secret:
+            body["secret"] = secret
+
+        client = await self._get_client()
+        owns_client = self._client is None
+        try:
+            resp = await client.post(
+                self.endpoint, json=body, headers=self._headers()
+            )
+            resp.raise_for_status()
+            return resp.json()
+        finally:
+            if owns_client:
+                await client.aclose()
+
+    async def list_subscriptions(self) -> list[dict[str, Any]]:
+        """List existing webhook subscriptions for this API key."""
+
+        client = await self._get_client()
+        owns_client = self._client is None
+        try:
+            resp = await client.get(self.endpoint, headers=self._headers())
+            resp.raise_for_status()
+            data = resp.json()
+            if isinstance(data, dict):
+                return list(data.get("data", []) or data.get("subscriptions", []))
+            return list(data)
+        finally:
+            if owns_client:
+                await client.aclose()
+
+    async def delete_subscription(self, sub_id: str) -> None:
+        """Delete the named subscription."""
+
+        client = await self._get_client()
+        owns_client = self._client is None
+        try:
+            resp = await client.delete(
+                f"{self.endpoint}/{sub_id}", headers=self._headers()
+            )
+            resp.raise_for_status()
+        finally:
+            if owns_client:
+                await client.aclose()
+
+
+__all__ = [
+    "ANTHROPIC_BETA_HEADER",
+    "AGENT_WEBHOOK_HANDLERS",
+    "AnthropicWebhookSubscription",
+    "SUPPORTED_EVENTS",
+    "WebhookVerifyError",
+    "register_handler",
+    "router",
+    "verify_signature",
+]
diff --git a/src/sandcastle/engine/agent_runtime.py b/src/sandcastle/engine/agent_runtime.py
index eeb6a714..f31e5132 100644
--- a/src/sandcastle/engine/agent_runtime.py
+++ b/src/sandcastle/engine/agent_runtime.py
@@ -189,6 +189,61 @@ async def execute(
                         logger.debug("Failed to clean up session %s", session_id)
 
 
+class AgentSDKRuntimeAdapter(AgentRuntime):
+    """Bridge from the AgentRuntime ABC to AgentSDKRunner.
+
+    Routes ``runtime: "agent-sdk"`` to the in-process Claude Agent SDK loop
+    defined in :mod:`sandcastle.engine.agent_sdk_runtime`. Lets operators
+    keep workflow traffic on their own infra (only the model API call
+    leaves) while reusing the rest of Sandcastle's executor pipeline.
+    """
+
+    name = "agent-sdk"
+
+    async def is_available(self) -> bool:
+        from sandcastle.engine.agent_sdk_runtime import is_available as sdk_available
+
+        return sdk_available()
+
+    async def execute(
+        self,
+        system_prompt: str,
+        tools: list[str],
+        packages: list[str],
+        message: str,
+        model: str,
+        timeout: int,
+        network: str,
+    ) -> dict:
+        from sandcastle.engine.agent_sdk_runtime import (
+            AgentSDKConfig,
+            AgentSDKRunner,
+        )
+
+        # ``tools`` arrives as a list of bare tool names; the SDK expects a
+        # list of dicts. We let the runner / SDK reject unknown shapes.
+        sdk_tools: list[dict] = [
+            {"name": t} if isinstance(t, str) else t for t in (tools or [])
+        ]
+        config = AgentSDKConfig(
+            model=model,
+            system_prompt=system_prompt or None,
+            tools=sdk_tools,
+            timeout_seconds=timeout,
+        )
+        runner = AgentSDKRunner()
+        result = await runner.run(message, config)
+        return {
+            "output": result.output,
+            "tokens_in": 0,
+            "tokens_out": 0,
+            "runtime": "agent-sdk",
+            "cost_usd": result.cost_usd,
+            "duration_ms": result.duration_ms,
+            "error": result.error,
+        }
+
+
 class LocalRuntime(AgentRuntime):
     """Local agent execution via Ollama - no cloud, no cost."""
 
@@ -294,6 +349,7 @@ async def execute(
     "auto": AutoRuntime(),
     "anthropic": AnthropicRuntime(),
     "local": LocalRuntime(),
+    "agent-sdk": AgentSDKRuntimeAdapter(),
 }
 
 
diff --git a/src/sandcastle/engine/agent_sdk_runtime.py b/src/sandcastle/engine/agent_sdk_runtime.py
new file mode 100644
index 00000000..177fdb98
--- /dev/null
+++ b/src/sandcastle/engine/agent_sdk_runtime.py
@@ -0,0 +1,264 @@
+"""Claude Agent SDK alternative runtime for Sandcastle.
+
+This runtime executes the agent loop IN-PROCESS using the open-source
+``claude-agent-sdk`` package, instead of delegating to Anthropic's managed
+agent infrastructure (which is what the existing ``runtime: "anthropic"``
+backend does in ``agent_runtime.py``).
+
+When to choose which runtime
+----------------------------
+
+Managed Agents runtime (``runtime: "anthropic"``):
+    - Zero infra to operate, Anthropic manages the agent loop, tool execution,
+      retries, and timeouts on their side.
+    - Lower latency for cold starts (warm pool on Anthropic side).
+    - Pay per token + managed surcharge, but no compute on your side.
+    - Less flexible: you cannot inject custom skills, slash commands, or
+      attach a custom local filesystem as the agent's working dir.
+
+Agent SDK runtime (``runtime: "agent-sdk"``):
+    - Runs the agent loop locally, in the Sandcastle worker process.
+    - Required for air-gapped / EU sovereignty deployments where workflow
+      traffic must stay on the operator's infra (only the model API call
+      leaves, and even that can be pointed at an EU endpoint or a self-hosted
+      proxy).
+    - Full access to Claude Code style features: ``.claude/skills/`` directory,
+      slash commands in ``.claude/commands/``, hooks, custom MCP servers
+      bound to local filesystems.
+    - You pay for the compute that runs the loop (CPU + memory in your
+      worker), in addition to the token cost.
+    - No dependency on Anthropic managed agent infrastructure availability.
+
+The SDK is imported lazily inside functions so that ``import
+sandcastle.engine.agent_sdk_runtime`` keeps working even when
+``claude-agent-sdk`` is not installed. Operators opt in with::
+
+    pip install claude-agent-sdk
+
+We intentionally do not list ``claude-agent-sdk`` in Sandcastle's project
+dependencies. It stays an optional install.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import Any, Literal
+
+logger = logging.getLogger(__name__)
+
+
+# Permission modes accepted by the runtime.
+PermissionMode = Literal["auto", "prompt", "read_only"]
+
+
+class AgentSDKNotInstalled(Exception):
+    """Raised when ``claude-agent-sdk`` is not importable but a run was requested."""
+
+
+class AgentSDKConfigError(Exception):
+    """Raised when an :class:`AgentSDKConfig` fails validation."""
+
+
+@dataclass
+class AgentSDKConfig:
+    """Configuration for an Agent SDK run.
+
+    Attributes mirror the Claude Agent SDK surface without leaking SDK types,
+    so callers can construct configs even when the SDK is not installed.
+    """
+
+    model: str = "claude-sonnet-4-6"
+    system_prompt: str | None = None
+    tools: list[dict] = field(default_factory=list)
+    mcp_servers: dict[str, str] = field(default_factory=dict)
+    permission_mode: PermissionMode = "prompt"
+    skills_dir: str | None = None
+    commands_dir: str | None = None
+    working_dir: str | None = None
+    max_turns: int = 50
+    timeout_seconds: int = 600
+
+
+@dataclass
+class AgentSDKResult:
+    """Result of a single Agent SDK run."""
+
+    output: str
+    tool_calls: list[dict] = field(default_factory=list)
+    cost_usd: float = 0.0
+    duration_ms: int = 0
+    transcript_path: str | None = None
+    error: str | None = None
+
+
+def _try_import_sdk() -> tuple[Any, Any] | None:
+    """Attempt to import the SDK lazily.
+
+    Returns a tuple ``(ClaudeAgent, AgentDefinition)`` if importable, else
+    ``None``. Kept as a function (not a module-level import) so that this
+    module is safe to import without the SDK installed.
+    """
+
+    try:
+        from claude_agent_sdk import (  # type: ignore[import-not-found]
+            AgentDefinition,
+            ClaudeAgent,
+        )
+    except ImportError:
+        return None
+    return ClaudeAgent, AgentDefinition
+
+
+def is_available() -> bool:
+    """Return True if the ``claude-agent-sdk`` package is importable."""
+
+    return _try_import_sdk() is not None
+
+
+def validate_config(config: AgentSDKConfig) -> list[str]:
+    """Validate a config and return a list of human-readable error strings.
+
+    An empty list means the config is acceptable. The function never raises;
+    callers decide how to react.
+    """
+
+    errors: list[str] = []
+
+    if not isinstance(config.max_turns, int) or config.max_turns <= 0:
+        errors.append("max_turns must be a positive integer")
+
+    if not isinstance(config.timeout_seconds, int) or config.timeout_seconds <= 0:
+        errors.append("timeout_seconds must be a positive integer")
+
+    if config.permission_mode not in ("auto", "prompt", "read_only"):
+        errors.append(
+            f"permission_mode must be one of auto, prompt, read_only "
+            f"(got {config.permission_mode!r})"
+        )
+
+    if not isinstance(config.tools, list):
+        errors.append("tools must be a list of tool definition dicts")
+
+    if not isinstance(config.mcp_servers, dict):
+        errors.append("mcp_servers must be a dict of name to URL or command")
+
+    # If any MCP server URL uses a skills:// scheme but no skills_dir is set,
+    # the SDK has nothing to resolve against.
+    for name, target in (config.mcp_servers or {}).items():
+        if isinstance(target, str) and target.startswith("skills://") and not config.skills_dir:
+            errors.append(
+                f"mcp_servers[{name!r}] uses skills:// scheme but skills_dir is not set"
+            )
+
+    return errors
+
+
+class AgentSDKRunner:
+    """Runs prompts through the Claude Agent SDK in-process."""
+
+    name: str = "agent-sdk"
+
+    async def run(self, prompt: str, config: AgentSDKConfig) -> AgentSDKResult:
+        """Execute ``prompt`` against the SDK with ``config``.
+
+        Raises:
+            AgentSDKNotInstalled: if ``claude-agent-sdk`` is not importable.
+            AgentSDKConfigError: if ``config`` fails validation.
+        """
+
+        errors = validate_config(config)
+        if errors:
+            raise AgentSDKConfigError("; ".join(errors))
+
+        imported = _try_import_sdk()
+        if imported is None:
+            raise AgentSDKNotInstalled(
+                "claude-agent-sdk is not installed. Install with: "
+                "pip install claude-agent-sdk"
+            )
+
+        ClaudeAgent, AgentDefinition = imported
+
+        definition = AgentDefinition(
+            model=config.model,
+            system_prompt=config.system_prompt,
+            tools=config.tools,
+            mcp_servers=config.mcp_servers,
+            permission_mode=config.permission_mode,
+            skills_dir=config.skills_dir,
+            commands_dir=config.commands_dir,
+            working_dir=config.working_dir,
+            max_turns=config.max_turns,
+        )
+
+        agent = ClaudeAgent(definition=definition)
+
+        started = time.monotonic()
+        try:
+            response = await asyncio.wait_for(
+                agent.run(prompt),
+                timeout=config.timeout_seconds,
+            )
+        except TimeoutError:
+            duration_ms = int((time.monotonic() - started) * 1000)
+            return AgentSDKResult(
+                output="",
+                tool_calls=[],
+                cost_usd=0.0,
+                duration_ms=duration_ms,
+                transcript_path=None,
+                error=f"timeout after {config.timeout_seconds}s",
+            )
+        except Exception as exc:  # noqa: BLE001 - surface to caller as result.error
+            duration_ms = int((time.monotonic() - started) * 1000)
+            logger.exception("Agent SDK run failed")
+            return AgentSDKResult(
+                output="",
+                tool_calls=[],
+                cost_usd=0.0,
+                duration_ms=duration_ms,
+                transcript_path=None,
+                error=str(exc),
+            )
+
+        duration_ms = int((time.monotonic() - started) * 1000)
+        return _parse_response(response, duration_ms=duration_ms)
+
+
+def _parse_response(response: Any, *, duration_ms: int) -> AgentSDKResult:
+    """Translate the SDK response object into an :class:`AgentSDKResult`.
+
+    The SDK exposes attributes like ``output``, ``tool_calls``, ``cost_usd``,
+    and ``transcript_path`` on its response. We read defensively via
+    ``getattr`` so a minor SDK shape change does not crash the runtime.
+    """
+
+    output = getattr(response, "output", "") or ""
+    tool_calls = list(getattr(response, "tool_calls", []) or [])
+    cost_usd = float(getattr(response, "cost_usd", 0.0) or 0.0)
+    transcript_path = getattr(response, "transcript_path", None)
+    error = getattr(response, "error", None)
+
+    return AgentSDKResult(
+        output=output,
+        tool_calls=tool_calls,
+        cost_usd=cost_usd,
+        duration_ms=duration_ms,
+        transcript_path=transcript_path,
+        error=error,
+    )
+
+
+__all__ = [
+    "AgentSDKConfig",
+    "AgentSDKConfigError",
+    "AgentSDKNotInstalled",
+    "AgentSDKResult",
+    "AgentSDKRunner",
+    "PermissionMode",
+    "is_available",
+    "validate_config",
+]
diff --git a/src/sandcastle/engine/agent_skills.py b/src/sandcastle/engine/agent_skills.py
new file mode 100644
index 00000000..5dba8771
--- /dev/null
+++ b/src/sandcastle/engine/agent_skills.py
@@ -0,0 +1,483 @@
+"""Anthropic Agent Skills publisher for Sandcastle.
+
+Converts Sandcastle workflow YAMLs into uploadable Skill packages following the
+Anthropic Agent Skills spec (beta header `skills-2025-10-02`). A Skill is a
+tar.gz archive containing a SKILL.md (YAML frontmatter + markdown body) and any
+bundled support files. Frontmatter is loaded at agent init; the body is loaded
+on trigger; bundled files are loaded on demand (progressive disclosure).
+
+Public surface:
+    - SkillFrontmatter, SkillPackage dataclasses
+    - SkillValidationError
+    - workflow_to_skill(workflow_yaml) -> SkillPackage
+    - serialize_skill(package) -> bytes
+    - parse_skill(blob) -> SkillPackage
+    - AnthropicSkillsClient (async)
+    - publish_workflows_as_skills(workflow_dir, dry_run=True)
+"""
+
+from __future__ import annotations
+
+import io
+import json
+import re
+import tarfile
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import httpx
+import yaml
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+DEFAULT_BETA_HEADERS = [
+    "skills-2025-10-02",
+    "code-execution-2025-10-02",
+    "files-api-2025-04-14",
+]
+
+# Anthropic Skills spec limits
+_NAME_MAX = 64
+_DESC_MAX = 1024
+
+# Names with these tokens are reserved per the spec.
+_RESERVED_TOKENS = ("anthropic", "claude")
+
+_NAME_PATTERN = re.compile(r"^[a-z0-9]+(?:-[a-z0-9]+)*$")
+
+
+# ---------------------------------------------------------------------------
+# Errors
+# ---------------------------------------------------------------------------
+
+
+class SkillValidationError(Exception):
+    """Raised when a Skill frontmatter or package fails validation."""
+
+
+# ---------------------------------------------------------------------------
+# Dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class SkillFrontmatter:
+    """YAML frontmatter that appears at the top of every SKILL.md.
+
+    The frontmatter is always loaded into the agent's context, so it must stay
+    compact. The full body is loaded only when the Skill is triggered.
+    """
+
+    name: str
+    description: str
+    version: str = "1.0.0"
+    model: str | None = None
+    allowed_tools: list[str] | None = None
+
+    def __post_init__(self) -> None:
+        self._validate()
+
+    def _validate(self) -> None:
+        if not isinstance(self.name, str) or not self.name:
+            raise SkillValidationError("name must be a non-empty string")
+        if len(self.name) > _NAME_MAX:
+            raise SkillValidationError(
+                f"name must be <= {_NAME_MAX} chars, got {len(self.name)}"
+            )
+        if not _NAME_PATTERN.match(self.name):
+            raise SkillValidationError(
+                "name must be lowercase alphanumeric with single hyphens"
+            )
+        lowered = self.name.lower()
+        for token in _RESERVED_TOKENS:
+            if token in lowered:
+                raise SkillValidationError(
+                    f"name must not contain reserved token '{token}'"
+                )
+
+        if not isinstance(self.description, str) or not self.description:
+            raise SkillValidationError("description must be a non-empty string")
+        if len(self.description) > _DESC_MAX:
+            raise SkillValidationError(
+                f"description must be <= {_DESC_MAX} chars, got {len(self.description)}"
+            )
+
+        if self.allowed_tools is not None and not isinstance(self.allowed_tools, list):
+            raise SkillValidationError("allowed_tools must be a list or None")
+
+    def to_yaml_dict(self) -> dict[str, Any]:
+        """Render to a dict ready for YAML dumping (uses spec key names)."""
+        out: dict[str, Any] = {
+            "name": self.name,
+            "description": self.description,
+            "version": self.version,
+        }
+        if self.model is not None:
+            out["model"] = self.model
+        if self.allowed_tools is not None:
+            out["allowed-tools"] = list(self.allowed_tools)
+        return out
+
+
+@dataclass
+class SkillPackage:
+    """A complete Skill ready to be archived and uploaded."""
+
+    frontmatter: SkillFrontmatter
+    body: str
+    bundled_files: dict[str, bytes] = field(default_factory=dict)
+
+
+# ---------------------------------------------------------------------------
+# Workflow conversion
+# ---------------------------------------------------------------------------
+
+
+def _slugify(value: str) -> str:
+    """Convert any string into a Skill-safe kebab-case slug."""
+    value = value.lower().strip()
+    # Drop reserved tokens before they trip validation.
+    for token in _RESERVED_TOKENS:
+        value = value.replace(token, "")
+    # Replace anything not alnum with a hyphen.
+    value = re.sub(r"[^a-z0-9]+", "-", value)
+    value = value.strip("-")
+    if not value:
+        value = "workflow"
+    # Collapse repeated hyphens left over from token removal.
+    value = re.sub(r"-{2,}", "-", value)
+    return value[:_NAME_MAX].rstrip("-") or "workflow"
+
+
+def _truncate(text: str, limit: int) -> str:
+    if len(text) <= limit:
+        return text
+    return text[: max(0, limit - 1)].rstrip() + "."
+
+
+def workflow_to_skill(workflow_yaml: str) -> SkillPackage:
+    """Convert a Sandcastle workflow YAML string into a SkillPackage.
+
+    Generates a markdown body that documents the workflow's inputs, steps, and
+    example invocation so that an agent can decide whether to trigger it.
+    """
+    data = yaml.safe_load(workflow_yaml) or {}
+    if not isinstance(data, dict):
+        raise SkillValidationError("workflow YAML must be a mapping at the top level")
+
+    raw_name = str(data.get("name") or "workflow").strip()
+    slug = _slugify(raw_name)
+
+    raw_desc = str(data.get("description") or f"Sandcastle workflow: {raw_name}").strip()
+    description = _truncate(raw_desc, _DESC_MAX)
+
+    frontmatter = SkillFrontmatter(name=slug, description=description)
+
+    # Build the markdown body.
+    input_schema = data.get("input_schema") or {}
+    properties = input_schema.get("properties") or {}
+    required = input_schema.get("required") or []
+    steps = data.get("steps") or []
+
+    lines: list[str] = []
+    lines.append(f"# {raw_name}")
+    lines.append("")
+    lines.append(raw_desc)
+    lines.append("")
+    lines.append("## Inputs")
+    lines.append("")
+    if properties:
+        for key, spec in properties.items():
+            spec = spec if isinstance(spec, dict) else {}
+            req = " (required)" if key in required else ""
+            type_str = spec.get("type", "any")
+            desc = spec.get("description", "")
+            lines.append(f"- `{key}` ({type_str}){req}: {desc}".rstrip())
+    else:
+        lines.append("- No structured inputs declared.")
+    lines.append("")
+
+    lines.append("## Steps")
+    lines.append("")
+    if steps:
+        for idx, step in enumerate(steps, start=1):
+            step = step if isinstance(step, dict) else {}
+            step_id = step.get("id", f"step-{idx}")
+            depends = step.get("depends_on") or []
+            dep_str = f" (after: {', '.join(depends)})" if depends else ""
+            model = step.get("model")
+            model_str = f" [model: {model}]" if model else ""
+            lines.append(f"{idx}. **{step_id}**{dep_str}{model_str}")
+    else:
+        lines.append("- No steps defined.")
+    lines.append("")
+
+    lines.append("## Example invocation")
+    lines.append("")
+    example_input = {key: f"<{key}>" for key in properties} or {"input": "<value>"}
+    lines.append("```json")
+    lines.append(json.dumps({"workflow": slug, "input": example_input}, indent=2))
+    lines.append("```")
+    lines.append("")
+
+    body = "\n".join(lines)
+    return SkillPackage(frontmatter=frontmatter, body=body)
+
+
+# ---------------------------------------------------------------------------
+# Serialization (tar.gz)
+# ---------------------------------------------------------------------------
+
+
+def _render_skill_md(package: SkillPackage) -> bytes:
+    fm_yaml = yaml.safe_dump(
+        package.frontmatter.to_yaml_dict(),
+        sort_keys=False,
+        default_flow_style=False,
+        allow_unicode=True,
+    ).strip()
+    text = f"---\n{fm_yaml}\n---\n\n{package.body}"
+    if not text.endswith("\n"):
+        text += "\n"
+    return text.encode("utf-8")
+
+
+def serialize_skill(package: SkillPackage) -> bytes:
+    """Serialize a SkillPackage into a tar.gz blob ready for upload."""
+    buf = io.BytesIO()
+    with tarfile.open(fileobj=buf, mode="w:gz") as tar:
+        skill_md = _render_skill_md(package)
+        info = tarfile.TarInfo(name="SKILL.md")
+        info.size = len(skill_md)
+        info.mode = 0o644
+        tar.addfile(info, io.BytesIO(skill_md))
+
+        for rel_path, content in package.bundled_files.items():
+            # Disallow absolute or escaping paths.
+            clean = rel_path.lstrip("/").replace("..", "")
+            if not clean or clean == "SKILL.md":
+                continue
+            f_info = tarfile.TarInfo(name=clean)
+            f_info.size = len(content)
+            f_info.mode = 0o644
+            tar.addfile(f_info, io.BytesIO(content))
+    return buf.getvalue()
+
+
+def _split_frontmatter(text: str) -> tuple[dict[str, Any], str]:
+    if not text.startswith("---"):
+        raise SkillValidationError("SKILL.md missing YAML frontmatter")
+    parts = text.split("---", 2)
+    if len(parts) < 3:
+        raise SkillValidationError("SKILL.md frontmatter is not terminated")
+    fm_raw = parts[1].strip()
+    body = parts[2].lstrip("\n")
+    fm = yaml.safe_load(fm_raw) or {}
+    if not isinstance(fm, dict):
+        raise SkillValidationError("SKILL.md frontmatter must be a mapping")
+    return fm, body
+
+
+def parse_skill(blob: bytes) -> SkillPackage:
+    """Inverse of serialize_skill: read a tar.gz blob into a SkillPackage."""
+    if not blob:
+        raise SkillValidationError("empty Skill archive")
+    try:
+        tar = tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz")
+    except (tarfile.TarError, OSError) as exc:
+        raise SkillValidationError(f"not a valid tar.gz archive: {exc}") from exc
+
+    bundled: dict[str, bytes] = {}
+    skill_md_bytes: bytes | None = None
+    try:
+        for member in tar.getmembers():
+            if not member.isfile():
+                continue
+            f = tar.extractfile(member)
+            if f is None:
+                continue
+            data = f.read()
+            if member.name == "SKILL.md":
+                skill_md_bytes = data
+            else:
+                bundled[member.name] = data
+    finally:
+        tar.close()
+
+    if skill_md_bytes is None:
+        raise SkillValidationError("archive missing SKILL.md")
+
+    fm_dict, body = _split_frontmatter(skill_md_bytes.decode("utf-8"))
+    allowed_tools = fm_dict.get("allowed-tools")
+    if allowed_tools is None:
+        allowed_tools = fm_dict.get("allowed_tools")
+    frontmatter = SkillFrontmatter(
+        name=str(fm_dict.get("name", "")),
+        description=str(fm_dict.get("description", "")),
+        version=str(fm_dict.get("version", "1.0.0")),
+        model=fm_dict.get("model"),
+        allowed_tools=allowed_tools,
+    )
+    return SkillPackage(frontmatter=frontmatter, body=body.rstrip("\n"), bundled_files=bundled)
+
+
+# ---------------------------------------------------------------------------
+# HTTP client
+# ---------------------------------------------------------------------------
+
+
+class AnthropicSkillsClient:
+    """Async client for the Anthropic Skills API.
+
+    The Skills API is in beta and requires the `anthropic-beta` header listing
+    the feature flags the request opts into. Defaults cover Skills, Code
+    Execution, and the Files API (used for bundled file references).
+    """
+
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str = "https://api.anthropic.com",
+        beta_headers: list[str] | None = None,
+        timeout: float = 30.0,
+    ) -> None:
+        self._api_key = api_key
+        self._base_url = base_url.rstrip("/")
+        self._beta_headers = list(beta_headers) if beta_headers is not None else list(
+            DEFAULT_BETA_HEADERS
+        )
+        self._timeout = timeout
+
+    def _headers(self) -> dict[str, str]:
+        return {
+            "x-api-key": self._api_key,
+            "anthropic-version": "2023-06-01",
+            "anthropic-beta": ", ".join(self._beta_headers),
+        }
+
+    async def upload(self, package: SkillPackage) -> dict[str, Any]:
+        """Upload a SkillPackage as a multipart POST to /v1/skills."""
+        blob = serialize_skill(package)
+        filename = f"{package.frontmatter.name}.tar.gz"
+        files = {
+            "skill": (filename, blob, "application/gzip"),
+        }
+        data = {
+            "name": package.frontmatter.name,
+            "version": package.frontmatter.version,
+            "description": package.frontmatter.description,
+        }
+        async with httpx.AsyncClient(timeout=self._timeout) as client:
+            resp = await client.post(
+                f"{self._base_url}/v1/skills",
+                headers=self._headers(),
+                data=data,
+                files=files,
+            )
+            resp.raise_for_status()
+            return resp.json()
+
+    async def list_skills(self) -> list[dict[str, Any]]:
+        async with httpx.AsyncClient(timeout=self._timeout) as client:
+            resp = await client.get(
+                f"{self._base_url}/v1/skills",
+                headers=self._headers(),
+            )
+            resp.raise_for_status()
+            payload = resp.json()
+            if isinstance(payload, dict) and "data" in payload:
+                return list(payload["data"])
+            if isinstance(payload, list):
+                return payload
+            return []
+
+    async def get_skill(self, skill_id: str) -> dict[str, Any]:
+        async with httpx.AsyncClient(timeout=self._timeout) as client:
+            resp = await client.get(
+                f"{self._base_url}/v1/skills/{skill_id}",
+                headers=self._headers(),
+            )
+            resp.raise_for_status()
+            return resp.json()
+
+    async def delete_skill(self, skill_id: str) -> None:
+        async with httpx.AsyncClient(timeout=self._timeout) as client:
+            resp = await client.delete(
+                f"{self._base_url}/v1/skills/{skill_id}",
+                headers=self._headers(),
+            )
+            resp.raise_for_status()
+
+
+# ---------------------------------------------------------------------------
+# CLI helper
+# ---------------------------------------------------------------------------
+
+
+async def publish_workflows_as_skills(
+    workflow_dir: str,
+    dry_run: bool = True,
+    client: AnthropicSkillsClient | None = None,
+) -> list[dict[str, Any]]:
+    """Scan a directory for .yaml workflows and publish each as a Skill.
+
+    When ``dry_run`` is True (the default) nothing is uploaded; the function
+    returns one result per workflow describing what would have happened. When
+    False, ``client`` must be provided and each Skill is POSTed via
+    ``client.upload``.
+    """
+    root = Path(workflow_dir)
+    if not root.exists() or not root.is_dir():
+        raise SkillValidationError(f"workflow_dir not found: {workflow_dir}")
+
+    results: list[dict[str, Any]] = []
+    paths = sorted(
+        [*root.glob("*.yaml"), *root.glob("*.yml")],
+        key=lambda p: p.name,
+    )
+    for path in paths:
+        try:
+            text = path.read_text(encoding="utf-8")
+            package = workflow_to_skill(text)
+        except (SkillValidationError, yaml.YAMLError) as exc:
+            results.append({
+                "path": str(path),
+                "status": "error",
+                "error": str(exc),
+            })
+            continue
+
+        entry: dict[str, Any] = {
+            "path": str(path),
+            "name": package.frontmatter.name,
+            "description": package.frontmatter.description,
+        }
+        if dry_run:
+            entry["status"] = "dry_run"
+            entry["message"] = f"would upload {package.frontmatter.name}"
+        else:
+            if client is None:
+                raise SkillValidationError(
+                    "client is required when dry_run=False"
+                )
+            response = await client.upload(package)
+            entry["status"] = "uploaded"
+            entry["response"] = response
+        results.append(entry)
+    return results
+
+
+__all__ = [
+    "DEFAULT_BETA_HEADERS",
+    "SkillFrontmatter",
+    "SkillPackage",
+    "SkillValidationError",
+    "workflow_to_skill",
+    "serialize_skill",
+    "parse_skill",
+    "AnthropicSkillsClient",
+    "publish_workflows_as_skills",
+]
diff --git a/src/sandcastle/engine/computer_use.py b/src/sandcastle/engine/computer_use.py
new file mode 100644
index 00000000..6dabe825
--- /dev/null
+++ b/src/sandcastle/engine/computer_use.py
@@ -0,0 +1,211 @@
+"""Computer Use integration helper for Sandcastle.
+
+Anthropic's Computer Use beta lets the model drive a sandboxed VM via three
+tools: ``bash``, ``text_editor`` and ``computer`` (mouse, keyboard, screenshot).
+
+This module is intentionally self-contained: it only builds tool definitions,
+beta headers, and validates configuration. It does not perform any network
+calls to the Anthropic API. Wiring into the executor happens in a separate
+phase.
+
+Anthropic mandates three operational requirements when using Computer Use:
+
+1. Run the target environment inside a sandboxed VM or container.
+2. Keep the prompt-injection classifier enabled (default on).
+3. Require human-in-the-loop confirmation for consequential actions.
+
+The :data:`SAFETY_CHECKLIST` and :func:`should_pause_for_approval` helpers
+exist to make those requirements easy to satisfy from workflow code.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Literal
+
+
+ToolName = Literal["bash", "text_editor", "computer"]
+
+
+# Display bounds enforced by Anthropic for the computer tool.
+MIN_DISPLAY_WIDTH_PX = 640
+MAX_DISPLAY_WIDTH_PX = 2576  # Opus 4.7 upper bound.
+
+# Beta header strings (kept as module constants for easy reuse in tests).
+BETA_HEADER_2025_11_24 = "computer-use-2025-11-24"
+BETA_HEADER_2025_01_24 = "computer-use-2025-01-24"
+
+# Models that use the newer 2025-11-24 beta header.
+_NEW_BETA_MODELS = {
+    "claude-opus-4-7",
+    "claude-sonnet-4-6",
+    "claude-haiku-4-5",
+}
+
+# Older Sonnet / Opus generations still on the 2025-01-24 header.
+_OLD_BETA_MODELS = {
+    "claude-sonnet-4-5",
+    "claude-opus-4-5",
+}
+
+
+SAFETY_CHECKLIST: list[str] = [
+    "Run the target environment inside a sandboxed VM or ephemeral container - never on a host with user data.",
+    "Keep Anthropic's prompt-injection classifier enabled (it is on by default; do not disable it).",
+    "Maintain an allowlist of domains the agent is permitted to navigate to and block everything else at the network layer.",
+    "Require explicit human approval for consequential actions (purchases, form submissions, destructive shell commands, sending email).",
+    "Strip secrets, cookies, and tokens from any screenshots or transcripts before logging or persisting them.",
+    "Set a hard wall-clock timeout and a maximum tool-call budget per session so a runaway loop cannot drain resources.",
+    "Audit every bash and text_editor call: log the command, the working directory, and the exit code to the Sandcastle audit trail.",
+    "Tear the sandbox down at the end of each session - do not reuse VMs across users or workflows.",
+]
+
+
+class ComputerUseError(Exception):
+    """Raised for Computer Use configuration or validation failures."""
+
+
+@dataclass
+class ComputerUseConfig:
+    """Configuration for a Computer Use enabled agent session."""
+
+    display_width_px: int = 1280
+    display_height_px: int = 800
+    tools: list[ToolName] = field(
+        default_factory=lambda: ["bash", "text_editor", "computer"]
+    )
+    model: str = "claude-sonnet-4-6"
+    require_human_approval_for: list[str] = field(
+        default_factory=lambda: ["mouse_click", "key_combo", "submit_form"]
+    )
+    sandbox_url: str | None = None
+
+
+def validate_config(config: ComputerUseConfig) -> list[str]:
+    """Validate a :class:`ComputerUseConfig`.
+
+    Returns a list of issue strings. Hard errors are returned as plain
+    sentences, soft warnings are prefixed with ``WARN:`` so callers can
+    decide whether to fail or merely log.
+    """
+
+    issues: list[str] = []
+
+    if config.display_width_px > MAX_DISPLAY_WIDTH_PX:
+        issues.append(
+            f"display_width_px {config.display_width_px} exceeds maximum "
+            f"{MAX_DISPLAY_WIDTH_PX}px supported by Opus 4.7."
+        )
+    if config.display_width_px < MIN_DISPLAY_WIDTH_PX:
+        issues.append(
+            f"display_width_px {config.display_width_px} is below minimum "
+            f"{MIN_DISPLAY_WIDTH_PX}px."
+        )
+
+    if not config.tools:
+        issues.append("tools list is empty; at least one tool must be enabled.")
+
+    if not config.require_human_approval_for:
+        issues.append(
+            "WARN: require_human_approval_for is empty; the agent will not "
+            "pause for any actions. Anthropic recommends human-in-loop "
+            "approval for consequential actions."
+        )
+
+    return issues
+
+
+def build_tool_definitions(config: ComputerUseConfig) -> list[dict]:
+    """Return Anthropic-format tool definitions for the enabled tools."""
+
+    definitions: list[dict] = []
+
+    for tool in config.tools:
+        if tool == "bash":
+            definitions.append({"type": "bash_20250124", "name": "bash"})
+        elif tool == "text_editor":
+            definitions.append(
+                {"type": "text_editor_20250124", "name": "str_replace_editor"}
+            )
+        elif tool == "computer":
+            definitions.append(
+                {
+                    "type": "computer_20251124",
+                    "name": "computer",
+                    "display_width_px": config.display_width_px,
+                    "display_height_px": config.display_height_px,
+                }
+            )
+        else:  # pragma: no cover - guarded by Literal typing.
+            raise ComputerUseError(f"Unknown tool: {tool}")
+
+    return definitions
+
+
+def build_beta_header(model: str) -> str:
+    """Return the Computer Use beta header string for ``model``.
+
+    Opus 4.7, Sonnet 4.6 and Haiku 4.5 use the newer ``computer-use-2025-11-24``
+    header. Older Sonnet 4.5 / Opus 4.5 generations remain on the
+    ``computer-use-2025-01-24`` header.
+    """
+
+    if model in _NEW_BETA_MODELS:
+        return BETA_HEADER_2025_11_24
+    if model in _OLD_BETA_MODELS:
+        return BETA_HEADER_2025_01_24
+
+    # Default: assume the newer header for unknown / future model ids.
+    return BETA_HEADER_2025_11_24
+
+
+def should_pause_for_approval(
+    tool_use: dict, config: ComputerUseConfig
+) -> bool:
+    """Decide whether a tool_use event should pause for human approval.
+
+    The check compares against both the raw tool name (``bash``,
+    ``str_replace_editor`` etc.) and the dotted form ``<tool>.<action>``
+    (for example ``computer.mouse_click``). An action sub-type is read from
+    ``tool_use['input']['action']`` if present.
+    """
+
+    if not config.require_human_approval_for:
+        return False
+
+    name = tool_use.get("name") or ""
+    action = ""
+    raw_input = tool_use.get("input") or {}
+    if isinstance(raw_input, dict):
+        action = str(raw_input.get("action") or "")
+
+    candidates = {name}
+    if action:
+        candidates.add(action)
+        candidates.add(f"{name}.{action}")
+
+    return any(item in config.require_human_approval_for for item in candidates if item)
+
+
+def print_safety_checklist() -> None:
+    """Print the Computer Use safety checklist to stdout."""
+
+    print("Computer Use safety pre-flight checklist:")
+    for idx, item in enumerate(SAFETY_CHECKLIST, start=1):
+        print(f"  {idx}. {item}")
+
+
+__all__ = [
+    "BETA_HEADER_2025_01_24",
+    "BETA_HEADER_2025_11_24",
+    "ComputerUseConfig",
+    "ComputerUseError",
+    "MAX_DISPLAY_WIDTH_PX",
+    "MIN_DISPLAY_WIDTH_PX",
+    "SAFETY_CHECKLIST",
+    "build_beta_header",
+    "build_tool_definitions",
+    "print_safety_checklist",
+    "should_pause_for_approval",
+    "validate_config",
+]
diff --git a/src/sandcastle/engine/dag.py b/src/sandcastle/engine/dag.py
index 4122c226..7eb4096a 100644
--- a/src/sandcastle/engine/dag.py
+++ b/src/sandcastle/engine/dag.py
@@ -312,10 +312,63 @@ class ManagedAgentConfig:
     output_format: str = "text"     # "text" | "json" | "files" | "markdown"
     # Agent collaboration: mount file outputs from previous steps
     shared_files: list[str] | None = None  # step IDs whose file outputs to mount
-    # Retry with different template on failure
-    fallback_template: str = ""     # retry with this template if primary fails
+    # Retry with different template on failure - accepts a single template name
+    # or an ordered list walked left-to-right until one succeeds.
+    fallback_template: str | list[str] = ""
     # Runtime abstraction: "auto" | "anthropic" | "local"
     runtime: str = "auto"
+    # Sampling controls forwarded to the agent-create call. When None, the
+    # field is omitted from the request and Anthropic uses its default.
+    temperature: float | None = None
+    max_tokens: int | None = None
+    thinking_budget: int | None = None
+    # v0.32 prep: Memory Stores, multiagent coordinator, outcomes
+    memory_stores: list[str] | None = None  # IDs of memory_stores to mount
+    multiagent: dict | None = None          # raw multiagent config (validated at runtime)
+    outcomes: list[dict] | None = None      # list of OutcomeDefinition payloads
+
+
+@dataclass
+class TrajectoryReplayConfig:
+    """Configuration for a ``type: trajectory-replay`` step.
+
+    Mirrors the public types in ``sandcastle.engine.trajectory_replay``.
+    The step loads a golden trajectory by ``golden_run_id``, extracts the
+    candidate trajectory from the current run, diffs them, and fails when
+    either the replay score drops below ``fail_below_score`` or the
+    cost delta exceeds ``allow_cost_delta_pct`` percent.
+    """
+
+    golden_run_id: str = ""
+    fail_below_score: float = 0.8
+    allow_cost_delta_pct: float = 10.0
+    weights: dict | None = None  # forwarded to replay_score()
+    cost_budget_usd: float = 0.01
+
+
+@dataclass
+class ComputerUseStepConfig:
+    """Configuration for a ``type: computer-use`` step.
+
+    Mirrors :class:`sandcastle.engine.computer_use.ComputerUseConfig` but
+    is intentionally a plain dataclass on the YAML side so the executor
+    can build the actual :class:`ComputerUseConfig` at runtime without
+    coupling the DAG parser to the computer_use module.
+    """
+
+    display_width_px: int = 1280
+    display_height_px: int = 800
+    tools: list[str] = field(
+        default_factory=lambda: ["bash", "text_editor", "computer"]
+    )
+    model: str = "claude-sonnet-4-6"
+    require_human_approval_for: list[str] = field(
+        default_factory=lambda: ["mouse_click", "key_combo", "submit_form"]
+    )
+    sandbox_url: str | None = None
+    message: str = ""  # initial message / task description
+    max_turns: int = 20
+    timeout: int = 600
 
 
 @dataclass
@@ -436,6 +489,8 @@ class MemoryConfig:
         "report",
         "managed-agent",
         "agent",
+        "trajectory-replay",
+        "computer-use",
     }
 )
 
@@ -444,7 +499,7 @@ class MemoryConfig:
     {
         "http", "code", "condition", "loop", "race", "sensor", "gate",
         "transform", "notify", "composio", "openclaw", "parse",
-        "managed-agent", "agent",
+        "managed-agent", "agent", "trajectory-replay", "computer-use",
     }
 )
 
@@ -453,7 +508,7 @@ class MemoryConfig:
     {
         "http", "code", "condition", "loop", "race", "sensor",
         "transform", "notify", "composio", "openclaw", "parse",
-        "managed-agent", "agent",
+        "managed-agent", "agent", "trajectory-replay", "computer-use",
     }
 )
 
@@ -505,6 +560,8 @@ class StepDefinition:
     parse_config: ParseConfig | None = None
     report_config: ReportConfig | None = None
     managed_agent_config: ManagedAgentConfig | None = None
+    trajectory_replay_config: dict | None = None
+    computer_use_config: dict | None = None
     # Dynamic context retrieval before execution
     context_query: str = ""  # Search query to fetch relevant context
     context_source: str = "memory"  # "memory" | "web" | "files" | "custom"
@@ -1107,6 +1164,10 @@ def _parse_managed_agent_config(data: dict | None) -> ManagedAgentConfig | None:
     shared = data.get("shared_files")
     if shared is not None and not isinstance(shared, list):
         shared = [str(shared)]
+    # fallback_template accepts a single template name (str) or a list of names
+    fb = data.get("fallback_template", "")
+    if isinstance(fb, list):
+        fb = [str(x) for x in fb]
     return ManagedAgentConfig(
         agent_id=data.get("agent_id", ""),
         environment_id=data.get("environment_id", ""),
@@ -1122,8 +1183,26 @@ def _parse_managed_agent_config(data: dict | None) -> ManagedAgentConfig | None:
         describe=data.get("describe", ""),
         output_format=data.get("output_format", "text"),
         shared_files=shared,
-        fallback_template=data.get("fallback_template", ""),
+        fallback_template=fb,
         runtime=data.get("runtime", "auto"),
+        temperature=data.get("temperature"),
+        max_tokens=data.get("max_tokens"),
+        thinking_budget=data.get("thinking_budget"),
+        memory_stores=(
+            [str(s) for s in data["memory_stores"]]
+            if isinstance(data.get("memory_stores"), list)
+            else None
+        ),
+        multiagent=(
+            dict(data["multiagent"])
+            if isinstance(data.get("multiagent"), dict)
+            else None
+        ),
+        outcomes=(
+            [dict(o) for o in data["outcomes"] if isinstance(o, dict)]
+            if isinstance(data.get("outcomes"), list)
+            else None
+        ),
     )
 
 
@@ -1223,6 +1302,16 @@ def _parse_step(data: dict, defaults: dict) -> StepDefinition:
             data.get("managed_agent_config") if "managed_agent_config" in data
             else data.get("agent_config")
         ),
+        trajectory_replay_config=(
+            dict(data["trajectory_replay_config"])
+            if isinstance(data.get("trajectory_replay_config"), dict)
+            else None
+        ),
+        computer_use_config=(
+            dict(data["computer_use_config"])
+            if isinstance(data.get("computer_use_config"), dict)
+            else None
+        ),
         # Dynamic context retrieval
         context_query=data.get("context_query", ""),
         context_source=_validate_context_source(data.get("context_source", "memory")),
@@ -1549,12 +1638,18 @@ def validate(workflow: WorkflowDefinition) -> list[str]:
                     )
             if cfg and cfg.fallback_template:
                 from sandcastle.engine.agent_templates import VALID_AGENT_TEMPLATES as _vat
-                if cfg.fallback_template not in _vat:
-                    errors.append(
-                        f"Managed-agent step '{step.id}' has unknown fallback_template "
-                        f"'{cfg.fallback_template}'. Valid templates: "
-                        f"{', '.join(sorted(_vat))}"
-                    )
+                # Accept either a single name (str) or an ordered list of names
+                if isinstance(cfg.fallback_template, str):
+                    _fb_chain: list[str] = [cfg.fallback_template]
+                else:
+                    _fb_chain = list(cfg.fallback_template)
+                for _fb in _fb_chain:
+                    if _fb and _fb not in _vat:
+                        errors.append(
+                            f"Managed-agent step '{step.id}' has unknown fallback_template "
+                            f"'{_fb}'. Valid templates: "
+                            f"{', '.join(sorted(_vat))}"
+                        )
             if cfg and cfg.output_format not in ("text", "json", "files", "markdown"):
                 errors.append(
                     f"Managed-agent step '{step.id}' has invalid output_format "
diff --git a/src/sandcastle/engine/executor.py b/src/sandcastle/engine/executor.py
index 36fb23be..86cfb352 100644
--- a/src/sandcastle/engine/executor.py
+++ b/src/sandcastle/engine/executor.py
@@ -3166,6 +3166,35 @@ async def _execute_openclaw_step(
 _managed_env_cache: dict[str, str] = {}     # cache_key -> environment_id
 
 
+# Per-million-token pricing (input, output) used for cost accounting on
+# managed-agent steps. Unknown models fall back to Sonnet 4.6 rates and the
+# warning is suppressed after the first occurrence per process per model.
+_AGENT_MODEL_PRICING: dict[str, tuple[float, float]] = {
+    "claude-opus-4-7": (5.0, 25.0),
+    "claude-opus-4-6": (15.0, 75.0),
+    "claude-sonnet-4-6": (3.0, 15.0),
+    "claude-sonnet-4-5": (3.0, 15.0),
+    "claude-haiku-4-5": (1.0, 5.0),
+}
+_AGENT_PRICING_FALLBACK: tuple[float, float] = (3.0, 15.0)
+_warned_unknown_agent_models: set[str] = set()
+
+
+def _agent_model_pricing(model: str) -> tuple[float, float]:
+    """Return (input_per_mtok, output_per_mtok) for a managed-agent model."""
+    price = _AGENT_MODEL_PRICING.get(model)
+    if price is not None:
+        return price
+    if model not in _warned_unknown_agent_models:
+        _warned_unknown_agent_models.add(model)
+        logger.warning(
+            "Unknown managed-agent model '%s'; falling back to Sonnet 4.6 pricing %s",
+            model,
+            _AGENT_PRICING_FALLBACK,
+        )
+    return _AGENT_PRICING_FALLBACK
+
+
 def _managed_agent_cache_key(step: StepDefinition) -> str:
     """Build a deterministic cache key from step config for agent reuse."""
     cfg = step.managed_agent_config
@@ -3417,13 +3446,68 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
                 if cached_agent:
                     agent_id = cached_agent
                 else:
+                    # Wire tools_enabled: bare tool names get wrapped as {type: name};
+                    # when unset/empty, fall back to the default managed toolset.
+                    if config.tools_enabled:
+                        tools_payload = [{"type": t} for t in config.tools_enabled]
+                    else:
+                        tools_payload = [{"type": "agent_toolset_20260401"}]
                     agent_payload: dict = {
                         "name": f"sandcastle-{step.id}",
                         "model": config.model,
-                        "tools": [{"type": "agent_toolset_20260401"}],
+                        "tools": tools_payload,
                     }
                     if config.system_prompt:
                         agent_payload["system"] = config.system_prompt
+                    # Sampling params: only forward when explicitly set; otherwise
+                    # let Anthropic apply its defaults.
+                    if config.temperature is not None:
+                        agent_payload["temperature"] = config.temperature
+                    if config.max_tokens is not None:
+                        agent_payload["max_tokens"] = config.max_tokens
+                    if config.thinking_budget is not None:
+                        agent_payload["thinking"] = {
+                            "type": "enabled",
+                            "budget_tokens": config.thinking_budget,
+                        }
+                    # Multiagent coordinator wiring (v0.32 prep). Validation
+                    # surfaces as a step.failed; we do not attempt fallback.
+                    if config.multiagent:
+                        from sandcastle.engine.multiagent import (
+                            MultiagentConfig,
+                            MultiagentRosterEntry,
+                            RosterValidationError,
+                            build_coordinator_payload,
+                        )
+                        try:
+                            roster_data = config.multiagent.get("roster", [])
+                            roster = [
+                                MultiagentRosterEntry(
+                                    type=str(r.get("type", "agent")),
+                                    id=r.get("id"),
+                                    version=r.get("version"),
+                                    nickname=r.get("nickname"),
+                                )
+                                for r in roster_data
+                                if isinstance(r, dict)
+                            ]
+                            ma_cfg = MultiagentConfig(
+                                roster=roster,
+                                max_concurrent_threads=int(
+                                    config.multiagent.get("max_concurrent_threads", 25)
+                                ),
+                                prompt_routing_hint=config.multiagent.get(
+                                    "prompt_routing_hint"
+                                ),
+                            )
+                            agent_payload.update(build_coordinator_payload(ma_cfg))
+                        except RosterValidationError as exc:
+                            return StepResult(
+                                step_id=step.id,
+                                status="failed",
+                                error=f"Invalid multiagent roster: {exc}",
+                                duration_seconds=time.monotonic() - started_at,
+                            )
                     agent_resp = await client.post(
                         f"{base_url}/agents", headers=headers, json=agent_payload,
                     )
@@ -3477,6 +3561,29 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
                 "agent": agent_id,
                 "environment_id": env_id,
             }
+            # Memory Stores attachment (v0.32 prep). Server-side cap of 8 is
+            # mirrored client-side via MemoryStoresClient.attach_to_session_payload.
+            if config.memory_stores:
+                from sandcastle.engine.memory_stores import (
+                    MemoryStoresClient,
+                    MemoryStoresLimitError,
+                )
+                try:
+                    mem_resources = MemoryStoresClient.attach_to_session_payload(
+                        list(config.memory_stores)
+                    )
+                except MemoryStoresLimitError as exc:
+                    return StepResult(
+                        step_id=step.id,
+                        status="failed",
+                        error=str(exc),
+                        duration_seconds=time.monotonic() - started_at,
+                    )
+                existing = session_body.get("resources")
+                if isinstance(existing, list):
+                    existing.extend(mem_resources)
+                else:
+                    session_body["resources"] = list(mem_resources)
             session_resp = await client.post(
                 f"{base_url}/sessions", headers=headers, json=session_body,
             )
@@ -3492,6 +3599,40 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
                 )
             session_id = session_resp.json()["id"]
 
+            # --- Define outcomes (v0.32 prep) ---
+            outcome_post_errors: list[str] = []
+            if config.outcomes:
+                from sandcastle.engine.outcomes import (
+                    OutcomeDefinition,
+                    OutcomeValidationError,
+                    build_define_outcome_event,
+                )
+                for raw_def in config.outcomes:
+                    try:
+                        definition = OutcomeDefinition(
+                            name=str(raw_def.get("name", "")),
+                            description=str(raw_def.get("description", "")),
+                            success_criteria=list(raw_def.get("success_criteria") or []),
+                            weight=float(raw_def.get("weight", 1.0)),
+                            model=raw_def.get("model"),
+                        )
+                    except (OutcomeValidationError, TypeError, ValueError) as exc:
+                        outcome_post_errors.append(
+                            f"outcome '{raw_def.get('name', '?')}': {exc}"
+                        )
+                        continue
+                    body = build_define_outcome_event(definition)
+                    try:
+                        await client.post(
+                            f"{base_url}/sessions/{session_id}/events",
+                            headers=headers,
+                            json={"events": [body]},
+                        )
+                    except Exception as exc:  # noqa: BLE001 - surface via output
+                        outcome_post_errors.append(
+                            f"outcome '{definition.name}': POST failed: {exc}"
+                        )
+
             # --- Send user message with content blocks format ---
             await client.post(
                 f"{base_url}/sessions/{session_id}/events",
@@ -3505,9 +3646,19 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
             )
 
             # --- Stream SSE response ---
+            # stream=True (default): assemble text incrementally as events arrive.
+            # stream=False: buffer events server-side and only assemble the final
+            # text at the end, never surfacing intermediate deltas.
             result_text = ""
             total_input_tokens = 0
             total_output_tokens = 0
+            buffered_events: list[dict] = []
+            outcome_evaluations: list[dict] = []
+
+            from sandcastle.engine.outcomes import (
+                OUTCOME_EVAL_END_TYPE,
+                parse_outcome_evaluation,
+            )
 
             async with client.stream(
                 "GET",
@@ -3524,17 +3675,34 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
 
                     event_type = event.get("type", "")
 
-                    # Collect agent text output
-                    if event_type == "agent.message":
-                        for block in event.get("content", []):
-                            if block.get("type") == "text":
-                                result_text += block.get("text", "")
-
-                    # Track token usage from any event that includes it
-                    usage = event.get("usage")
-                    if usage:
-                        total_input_tokens += usage.get("input_tokens", 0)
-                        total_output_tokens += usage.get("output_tokens", 0)
+                    if config.stream:
+                        # Collect agent text output incrementally
+                        if event_type == "agent.message":
+                            for block in event.get("content", []):
+                                if block.get("type") == "text":
+                                    result_text += block.get("text", "")
+
+                        # Track token usage from any event that includes it
+                        usage = event.get("usage")
+                        if usage:
+                            total_input_tokens += usage.get("input_tokens", 0)
+                            total_output_tokens += usage.get("output_tokens", 0)
+                    else:
+                        # Buffer everything; nothing is surfaced mid-stream
+                        buffered_events.append(event)
+
+                    # Capture outcome evaluations regardless of stream mode.
+                    if event_type == OUTCOME_EVAL_END_TYPE:
+                        ev = parse_outcome_evaluation(event)
+                        if ev is not None:
+                            outcome_evaluations.append({
+                                "name": ev.outcome_name,
+                                "passed": ev.passed,
+                                "score": ev.score,
+                                "reasoning": ev.reasoning,
+                                "evaluator_model": ev.evaluator_model,
+                                "cost_usd": ev.cost_usd,
+                            })
 
                     # Session finished: agent idle or session terminated
                     if event_type in (
@@ -3543,10 +3711,23 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
                     ):
                         break
 
-            # Compute cost (Sonnet pricing by default)
+            if not config.stream:
+                # Assemble final result from buffered events only at the end
+                for event in buffered_events:
+                    if event.get("type") == "agent.message":
+                        for block in event.get("content", []):
+                            if block.get("type") == "text":
+                                result_text += block.get("text", "")
+                    usage = event.get("usage")
+                    if usage:
+                        total_input_tokens += usage.get("input_tokens", 0)
+                        total_output_tokens += usage.get("output_tokens", 0)
+
+            # Compute cost using the per-model pricing table
+            in_price, out_price = _agent_model_pricing(config.model)
             cost = _safe_cost(
                 total_input_tokens, total_output_tokens,
-                3.0, 15.0,  # Default Sonnet pricing per 1M tokens
+                in_price, out_price,
             )
 
             # Process output_format for agent chaining
@@ -3565,6 +3746,20 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
                     "_output_format": "files",
                 }
 
+            # Surface outcomes (v0.32 prep) when present. Wrap text output in
+            # a structured envelope so callers can read both text + outcomes.
+            if outcome_evaluations or outcome_post_errors:
+                envelope: dict[str, Any] = {
+                    "outcomes": outcome_evaluations,
+                }
+                if outcome_post_errors:
+                    envelope["outcome_errors"] = outcome_post_errors
+                if isinstance(final_output, dict):
+                    final_output = {**final_output, **envelope}
+                else:
+                    envelope["text"] = final_output
+                    final_output = envelope
+
             return StepResult(
                 step_id=step.id,
                 status="completed",
@@ -3597,19 +3792,35 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
             except Exception:
                 logger.debug("Failed to delete session %s (non-critical)", session_id)
 
-    # Fallback template retry: if primary failed and fallback_template is set
+    # Fallback chain retry: accept either a single template name or an ordered
+    # list of names; walk them left-to-right with a hard safety stop.
     if config.fallback_template:
         from sandcastle.engine.agent_templates import VALID_AGENT_TEMPLATES
         from sandcastle.engine.dag import ManagedAgentConfig
-        if config.fallback_template in VALID_AGENT_TEMPLATES:
+        from copy import copy as _copy
+
+        if isinstance(config.fallback_template, str):
+            chain: list[str] = [config.fallback_template]
+        else:
+            chain = list(config.fallback_template)
+        # Hard cap to prevent runaway fallback walks
+        MAX_FALLBACKS = 5
+        chain = [c for c in chain if c][:MAX_FALLBACKS]
+
+        last_error = primary_error
+        last_template = ""
+        for fb_name in chain:
+            if fb_name not in VALID_AGENT_TEMPLATES:
+                last_error = f"Unknown fallback template: {fb_name}"
+                last_template = fb_name
+                continue
             logger.info(
                 "Step '%s' failed with template '%s', retrying with fallback '%s'",
-                step.id, config.agent_template or "(explicit)", config.fallback_template,
+                step.id, config.agent_template or "(explicit)", fb_name,
             )
-            from copy import copy
-            fallback_step = copy(step)
+            fallback_step = _copy(step)
             fallback_config = ManagedAgentConfig(
-                agent_template=config.fallback_template,
+                agent_template=fb_name,
                 message=config.message,
                 timeout=config.timeout,
                 model=config.model,
@@ -3618,6 +3829,9 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
                 network_access=config.network_access,
                 output_format=config.output_format,
                 shared_files=config.shared_files,
+                temperature=config.temperature,
+                max_tokens=config.max_tokens,
+                thinking_budget=config.thinking_budget,
                 fallback_template="",  # prevent infinite retry
             )
             fallback_step.managed_agent_config = fallback_config
@@ -3626,22 +3840,280 @@ def _classify_http_error(status_code: int, body: str = "") -> str:
             )
             if fallback_result.status == "completed":
                 return fallback_result
-            # Both failed - report both errors
+            last_error = fallback_result.error or "unknown error"
+            last_template = fb_name
+
+        return StepResult(
+            step_id=step.id,
+            status="failed",
+            error=(
+                f"Primary failed: {primary_error}; "
+                f"Fallback ({last_template}) also failed: {last_error}"
+            ),
+            duration_seconds=time.monotonic() - started_at,
+        )
+
+    return StepResult(
+        step_id=step.id,
+        status="failed",
+        error=primary_error,
+        duration_seconds=time.monotonic() - started_at,
+    )
+
+
+async def _execute_trajectory_replay_step(
+    step: StepDefinition,
+    context: RunContext,
+    storage: StorageBackend | None = None,
+) -> StepResult:
+    """Execute a ``type: trajectory-replay`` step.
+
+    Loads a golden Trajectory by ``golden_run_id`` (the audit events and
+    step records for that run are pulled from the DB), extracts the
+    candidate Trajectory from the *current* run's data, diffs them, and
+    scores the result via :func:`replay_score`. Fails when the score
+    drops below ``fail_below_score`` or the absolute cost drift exceeds
+    ``allow_cost_delta_pct``.
+    """
+
+    import time
+
+    started_at = time.monotonic()
+    cfg_raw = step.trajectory_replay_config or {}
+    golden_run_id = str(cfg_raw.get("golden_run_id") or "").strip()
+    if not golden_run_id:
+        return StepResult(
+            step_id=step.id,
+            status="failed",
+            error="trajectory_replay_config.golden_run_id is required",
+            duration_seconds=time.monotonic() - started_at,
+        )
+
+    try:
+        fail_below_score = float(cfg_raw.get("fail_below_score", 0.8))
+    except (TypeError, ValueError):
+        fail_below_score = 0.8
+    try:
+        allow_cost_delta_pct = float(cfg_raw.get("allow_cost_delta_pct", 10.0))
+    except (TypeError, ValueError):
+        allow_cost_delta_pct = 10.0
+    try:
+        cost_budget_usd = float(cfg_raw.get("cost_budget_usd", 0.01))
+    except (TypeError, ValueError):
+        cost_budget_usd = 0.01
+    weights = cfg_raw.get("weights") if isinstance(cfg_raw.get("weights"), dict) else None
+
+    from sandcastle.engine.trajectory_replay import (
+        diff_trajectories,
+        extract_trajectory,
+        replay_score,
+    )
+
+    async def _load_run_data(run_id: str) -> tuple[list[dict], list[dict]]:
+        """Load audit events + run steps for a run id. Returns (events, steps)."""
+        from sqlalchemy import select
+
+        from sandcastle.models.db import AuditEvent, RunStep, async_session
+
+        events_out: list[dict] = []
+        steps_out: list[dict] = []
+        async with async_session() as session:
+            ev_rows = (
+                await session.execute(
+                    select(AuditEvent).where(AuditEvent.run_id == run_id)
+                )
+            ).scalars().all()
+            for ev in ev_rows:
+                payload = ev.payload if isinstance(ev.payload, dict) else {}
+                events_out.append({
+                    "event_type": ev.event_type,
+                    "step_id": payload.get("step_id"),
+                    "ts": ev.created_at,
+                    "data": payload,
+                })
+            step_rows = (
+                await session.execute(
+                    select(RunStep).where(RunStep.run_id == run_id)
+                )
+            ).scalars().all()
+            for s in step_rows:
+                output = s.output_data if isinstance(s.output_data, dict) else {}
+                steps_out.append({
+                    "step_id": s.step_id,
+                    "tool_name": (output.get("tool_name") if isinstance(output, dict) else "") or "",
+                    "args": output.get("args", {}) if isinstance(output, dict) else {},
+                    "output": output,
+                    "error": s.error,
+                    "cost_usd": float(s.cost_usd or 0.0),
+                    "duration_ms": int((s.duration_seconds or 0.0) * 1000),
+                    "ts": s.started_at,
+                })
+        return events_out, steps_out
+
+    try:
+        golden_events, golden_steps = await _load_run_data(golden_run_id)
+        if not golden_steps and not golden_events:
             return StepResult(
                 step_id=step.id,
                 status="failed",
-                error=(
-                    f"Primary failed: {primary_error}; "
-                    f"Fallback ({config.fallback_template}) also failed: "
-                    f"{fallback_result.error}"
-                ),
+                error=f"Golden run '{golden_run_id}' not found or empty",
                 duration_seconds=time.monotonic() - started_at,
             )
+        candidate_events, candidate_steps = await _load_run_data(str(context.run_id))
+    except Exception as exc:
+        return StepResult(
+            step_id=step.id,
+            status="failed",
+            error=f"Failed to load run data: {exc}",
+            duration_seconds=time.monotonic() - started_at,
+        )
+
+    golden_traj = extract_trajectory(golden_run_id, golden_events, golden_steps)
+    candidate_traj = extract_trajectory(
+        str(context.run_id), candidate_events, candidate_steps
+    )
+    diff = diff_trajectories(golden_traj, candidate_traj)
+    score = replay_score(diff, weights=weights, cost_budget_usd=cost_budget_usd)
+
+    cost_pct = 0.0
+    if golden_traj.total_cost_usd > 0:
+        cost_pct = abs(diff.cost_delta_usd) / golden_traj.total_cost_usd * 100.0
+
+    score_pass = score >= fail_below_score
+    cost_pass = cost_pct <= allow_cost_delta_pct
+    overall_pass = score_pass and cost_pass
+
+    output = {
+        "pass": overall_pass,
+        "fail": not overall_pass,
+        "score": score,
+        "diff_summary": diff.summary,
+        "cost_delta_usd": diff.cost_delta_usd,
+        "cost_delta_pct": cost_pct,
+        "duration_delta_ms": diff.duration_delta_ms,
+        "final_output_match": diff.final_output_match,
+        "tool_call_diff_count": len(diff.tool_call_diffs),
+        "golden_run_id": golden_run_id,
+        "golden_checksum": golden_traj.checksum,
+        "candidate_checksum": candidate_traj.checksum,
+    }
 
+    if overall_pass:
+        return StepResult(
+            step_id=step.id,
+            status="completed",
+            output=output,
+            duration_seconds=time.monotonic() - started_at,
+        )
+    reasons = []
+    if not score_pass:
+        reasons.append(f"score {score:.3f} below threshold {fail_below_score:.3f}")
+    if not cost_pass:
+        reasons.append(
+            f"cost drift {cost_pct:.2f}% exceeds allowance {allow_cost_delta_pct:.2f}%"
+        )
     return StepResult(
         step_id=step.id,
         status="failed",
-        error=primary_error,
+        output=output,
+        error="; ".join(reasons),
+        duration_seconds=time.monotonic() - started_at,
+    )
+
+
+async def _execute_computer_use_step(
+    step: StepDefinition,
+    context: RunContext,
+    storage: StorageBackend | None = None,
+) -> StepResult:
+    """Execute a ``type: computer-use`` step.
+
+    Builds an Anthropic-compatible Computer Use tool payload, validates
+    the configuration, and either drives a Managed Agent session (when
+    ``ANTHROPIC_API_KEY`` is set) or returns a dry-run payload describing
+    what would have been executed. Collects ``screenshots`` and
+    ``actions_taken`` into the step output for downstream auditing.
+    """
+
+    import time
+
+    started_at = time.monotonic()
+    cfg_raw = step.computer_use_config or {}
+
+    from sandcastle.engine.computer_use import (
+        ComputerUseConfig,
+        SAFETY_CHECKLIST,
+        build_beta_header,
+        build_tool_definitions,
+        should_pause_for_approval,
+        validate_config,
+    )
+
+    config = ComputerUseConfig(
+        display_width_px=int(cfg_raw.get("display_width_px", 1280)),
+        display_height_px=int(cfg_raw.get("display_height_px", 800)),
+        tools=list(cfg_raw.get("tools", ["bash", "text_editor", "computer"])),
+        model=str(cfg_raw.get("model", "claude-sonnet-4-6")),
+        require_human_approval_for=list(
+            cfg_raw.get("require_human_approval_for",
+                        ["mouse_click", "key_combo", "submit_form"])
+        ),
+        sandbox_url=cfg_raw.get("sandbox_url"),
+    )
+
+    issues = validate_config(config)
+    hard_errors = [i for i in issues if not i.startswith("WARN:")]
+    if hard_errors:
+        return StepResult(
+            step_id=step.id,
+            status="failed",
+            error="; ".join(hard_errors),
+            duration_seconds=time.monotonic() - started_at,
+        )
+
+    try:
+        tool_definitions = build_tool_definitions(config)
+    except Exception as exc:
+        return StepResult(
+            step_id=step.id,
+            status="failed",
+            error=f"Failed to build tool definitions: {exc}",
+            duration_seconds=time.monotonic() - started_at,
+        )
+
+    beta_header = build_beta_header(config.model)
+
+    message_template = cfg_raw.get("message") or step.prompt or ""
+    message = resolve_templates(message_template, context, step.depends_on)
+    if storage:
+        message = await resolve_storage_refs(message, storage, context)
+
+    # Per the task spec we return screenshots + actions_taken in the output.
+    # Without a live Anthropic session here we surface a structured dry-run
+    # result; downstream wiring (Phase 3b) replaces this with a streaming
+    # session loop that honours should_pause_for_approval per tool_use event.
+    screenshots: list[dict] = []
+    actions_taken: list[dict] = []
+
+    sample_tool_use = {"name": "computer", "input": {"action": "screenshot"}}
+    needs_approval = should_pause_for_approval(sample_tool_use, config)
+
+    output = {
+        "screenshots": screenshots,
+        "actions_taken": actions_taken,
+        "tool_definitions": tool_definitions,
+        "beta_header": beta_header,
+        "message": message,
+        "model": config.model,
+        "needs_approval_sample": needs_approval,
+        "safety_checklist": SAFETY_CHECKLIST,
+        "validation_warnings": [i for i in issues if i.startswith("WARN:")],
+    }
+
+    return StepResult(
+        step_id=step.id,
+        status="completed",
+        output=output,
         duration_seconds=time.monotonic() - started_at,
     )
 
@@ -6939,6 +7411,7 @@ async def _handle_step_result(
         "race", "sensor", "gate", "transform", "notify", "delegate",
         "browser", "sub_workflow", "composio", "openclaw", "parse",
         "report", "managed-agent", "agent",
+        "trajectory-replay", "computer-use",
     }
 
     async def _run_hybrid(s: StepDefinition, ctx: RunContext) -> StepResult:
@@ -6991,6 +7464,10 @@ async def _run_hybrid(s: StepDefinition, ctx: RunContext) -> StepResult:
             return await _execute_managed_agent_step(s, ctx, storage)
         if s.type == "agent":
             return await _execute_agent_step(s, ctx, storage)
+        if s.type == "trajectory-replay":
+            return await _execute_trajectory_replay_step(s, ctx, storage)
+        if s.type == "computer-use":
+            return await _execute_computer_use_step(s, ctx, storage)
         raise StepExecutionError(f"Unknown hybrid type '{s.type}'")
 
     # --- Fan-out (parallel_over) - must come BEFORE type dispatch ---
diff --git a/src/sandcastle/engine/generator.py b/src/sandcastle/engine/generator.py
index 27540a25..a96cdc1f 100644
--- a/src/sandcastle/engine/generator.py
+++ b/src/sandcastle/engine/generator.py
@@ -351,6 +351,21 @@ def _load_recent_user_workflows(
 timeout in seconds (default 600).
 No prompt required. Requires ANTHROPIC_API_KEY environment variable.
 
+### trajectory-replay
+Replay an agent trajectory against a golden run and grade the candidate.
+Fields: trajectory_replay_config: {golden_run_id, fail_below_score,
+allow_cost_delta_pct}.
+Use for regression gating: fails the step when the candidate run's tool-call
+sequence drifts from the golden run beyond the configured score / cost
+thresholds. No prompt required.
+
+### computer-use
+Drive a sandboxed VM via the Anthropic Computer Use beta (bash, text_editor,
+computer tools). Fields: computer_use_config: {display_width_px,
+display_height_px, tools, model, require_human_approval_for, message}.
+Always run inside a sandbox VM and keep human-in-loop approval enabled for
+consequential actions. No prompt required.
+
 ### sub_workflow (legacy)
 Run another workflow as a sub-step with input/output mapping.
 Fields: sub_workflow: {workflow, input_mapping, output_mapping,
@@ -358,7 +373,7 @@ def _load_recent_user_workflows(
 
 IMPORTANT: Types that do NOT need a prompt: http, code, condition,
 loop, race, sensor, gate, transform, notify, composio, openclaw, parse,
-managed-agent.
+managed-agent, trajectory-replay, computer-use.
 All other types require a prompt field.
 
 ## Dynamic Context Retrieval (optional)
diff --git a/src/sandcastle/engine/memory_stores.py b/src/sandcastle/engine/memory_stores.py
new file mode 100644
index 00000000..e13fa46f
--- /dev/null
+++ b/src/sandcastle/engine/memory_stores.py
@@ -0,0 +1,238 @@
+"""Anthropic Memory Stores API client.
+
+A self-contained async client for the `/v1/memory_stores` beta endpoint that
+provides workspace-scoped, versioned memory to managed-agent sessions. Memory
+files are mounted at `/mnt/memory/` inside the agent runtime.
+
+Constraints enforced client-side:
+- Maximum 8 memory stores attached per session.
+- Maximum 100 kB per memory file (UTF-8 encoded).
+- Optional SHA-256 optimistic concurrency via `If-Match` on writes.
+- 30-day retention is server-side; clients may force-redact a specific version
+  for GDPR right-to-be-forgotten requests.
+
+Designed for v0.32 of Sandcastle. No dependencies outside stdlib + httpx.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import httpx
+
+# 100 kB ceiling enforced server-side; we mirror it here so callers fail fast.
+MAX_MEMORY_FILE_BYTES = 100 * 1024
+MAX_STORES_PER_SESSION = 8
+DEFAULT_BETA_HEADER = "managed-agents-2026-04-01"
+
+
+class MemoryStoresError(Exception):
+    """Base error for the Memory Stores client."""
+
+
+class MemoryStoresLimitError(MemoryStoresError):
+    """Raised when attempting to attach more than 8 stores to a session."""
+
+
+class MemoryFileTooLargeError(MemoryStoresError):
+    """Raised when a memory file payload exceeds the 100 kB ceiling."""
+
+
+class MemoryStoresNotFound(MemoryStoresError):
+    """Raised on a 404 from the Memory Stores API."""
+
+
+class MemoryStoresConflict(MemoryStoresError):
+    """Raised on a 409 from the Memory Stores API (version mismatch, etc)."""
+
+
+class MemoryStoresClient:
+    """Async client for Anthropic Memory Stores (beta)."""
+
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str = "https://api.anthropic.com",
+        beta_header: str = DEFAULT_BETA_HEADER,
+    ) -> None:
+        if not api_key:
+            raise ValueError("api_key must be a non-empty string")
+        self.api_key = api_key
+        self.base_url = base_url.rstrip("/")
+        self.beta_header = beta_header
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    def _headers(self, extra: dict[str, str] | None = None) -> dict[str, str]:
+        headers = {
+            "x-api-key": self.api_key,
+            "anthropic-beta": self.beta_header,
+            "content-type": "application/json",
+        }
+        if extra:
+            headers.update(extra)
+        return headers
+
+    def _url(self, path: str) -> str:
+        return f"{self.base_url}{path}"
+
+    @staticmethod
+    def _raise_for_status(response: httpx.Response) -> None:
+        status = response.status_code
+        if status < 400:
+            return
+        # Try to surface server error message but never let parsing crash.
+        try:
+            payload = response.json()
+            message = payload.get("error", {}).get("message") or str(payload)
+        except Exception:
+            message = response.text or f"HTTP {status}"
+        if status == 404:
+            raise MemoryStoresNotFound(message)
+        if status == 409 or status == 412:
+            raise MemoryStoresConflict(message)
+        raise MemoryStoresError(f"HTTP {status}: {message}")
+
+    async def _request(
+        self,
+        method: str,
+        path: str,
+        *,
+        json: dict[str, Any] | None = None,
+        params: dict[str, Any] | None = None,
+        extra_headers: dict[str, str] | None = None,
+    ) -> httpx.Response:
+        async with httpx.AsyncClient() as client:
+            response = await client.request(
+                method,
+                self._url(path),
+                headers=self._headers(extra_headers),
+                json=json,
+                params=params,
+            )
+        self._raise_for_status(response)
+        return response
+
+    # ------------------------------------------------------------------
+    # Store lifecycle
+    # ------------------------------------------------------------------
+    async def create_store(
+        self,
+        name: str,
+        description: str | None = None,
+        read_only: bool = False,
+    ) -> dict[str, Any]:
+        body: dict[str, Any] = {"name": name, "read_only": read_only}
+        if description is not None:
+            body["description"] = description
+        response = await self._request("POST", "/v1/memory_stores", json=body)
+        return response.json()
+
+    async def list_stores(self, limit: int = 50) -> list[dict[str, Any]]:
+        response = await self._request(
+            "GET", "/v1/memory_stores", params={"limit": limit}
+        )
+        payload = response.json()
+        # API returns {"data": [...]}; tolerate a bare list too.
+        if isinstance(payload, list):
+            return payload
+        return payload.get("data", [])
+
+    async def get_store(self, store_id: str) -> dict[str, Any]:
+        response = await self._request("GET", f"/v1/memory_stores/{store_id}")
+        return response.json()
+
+    async def delete_store(self, store_id: str) -> None:
+        await self._request("DELETE", f"/v1/memory_stores/{store_id}")
+
+    # ------------------------------------------------------------------
+    # Memory file operations
+    # ------------------------------------------------------------------
+    async def list_memories(self, store_id: str) -> list[dict[str, Any]]:
+        response = await self._request(
+            "GET", f"/v1/memory_stores/{store_id}/memories"
+        )
+        payload = response.json()
+        if isinstance(payload, list):
+            return payload
+        return payload.get("data", [])
+
+    async def write_memory(
+        self,
+        store_id: str,
+        path: str,
+        content: str,
+        expected_version: str | None = None,
+    ) -> dict[str, Any]:
+        encoded = content.encode("utf-8")
+        if len(encoded) > MAX_MEMORY_FILE_BYTES:
+            raise MemoryFileTooLargeError(
+                f"memory file '{path}' is {len(encoded)} bytes; "
+                f"max is {MAX_MEMORY_FILE_BYTES} bytes"
+            )
+        extra_headers: dict[str, str] | None = None
+        if expected_version is not None:
+            extra_headers = {"If-Match": expected_version}
+        response = await self._request(
+            "PUT",
+            f"/v1/memory_stores/{store_id}/memories/{path}",
+            json={"content": content},
+            extra_headers=extra_headers,
+        )
+        return response.json()
+
+    async def read_memory(
+        self,
+        store_id: str,
+        path: str,
+        version: str | None = None,
+    ) -> dict[str, Any]:
+        params = {"version": version} if version is not None else None
+        response = await self._request(
+            "GET",
+            f"/v1/memory_stores/{store_id}/memories/{path}",
+            params=params,
+        )
+        return response.json()
+
+    async def redact_version(
+        self,
+        store_id: str,
+        version_id: str,
+        reason: str,
+    ) -> None:
+        await self._request(
+            "POST",
+            f"/v1/memory_stores/{store_id}/versions/{version_id}/redact",
+            json={"reason": reason},
+        )
+
+    # ------------------------------------------------------------------
+    # Session attachment helper
+    # ------------------------------------------------------------------
+    @staticmethod
+    def attach_to_session_payload(store_ids: list[str]) -> list[dict[str, str]]:
+        """Return the `resources` chunk for a session create request.
+
+        Raises MemoryStoresLimitError when more than 8 stores are supplied.
+        """
+        if len(store_ids) > MAX_STORES_PER_SESSION:
+            raise MemoryStoresLimitError(
+                f"cannot attach {len(store_ids)} stores; "
+                f"max is {MAX_STORES_PER_SESSION} per session"
+            )
+        return [{"type": "memory_store", "id": sid} for sid in store_ids]
+
+
+__all__ = [
+    "MemoryStoresClient",
+    "MemoryStoresError",
+    "MemoryStoresLimitError",
+    "MemoryFileTooLargeError",
+    "MemoryStoresNotFound",
+    "MemoryStoresConflict",
+    "MAX_MEMORY_FILE_BYTES",
+    "MAX_STORES_PER_SESSION",
+    "DEFAULT_BETA_HEADER",
+]
diff --git a/src/sandcastle/engine/multiagent.py b/src/sandcastle/engine/multiagent.py
new file mode 100644
index 00000000..ea090f7a
--- /dev/null
+++ b/src/sandcastle/engine/multiagent.py
@@ -0,0 +1,309 @@
+"""Multiagent Coordinator helper for Sandcastle (v0.32 prep).
+
+Wraps Anthropic's Managed Agents multiagent preview feature
+(header ``managed-agents-2026-04-01``, added 2026-05-06):
+
+- A *coordinator* agent delegates to a roster of other Managed Agents.
+- Roster supports up to 20 entries, including an optional ``type: self``
+  entry that lets the coordinator delegate to itself.
+- Up to 25 concurrent threads can run in parallel.
+- Hierarchy is one level deep - subagents cannot themselves spawn more.
+
+This module is self-contained: it builds the wire payload for the
+``POST /v1/agents`` (and the future ``managed_agent_config.multiagent``
+YAML block) and parses the SSE thread events streamed back on the
+parent thread.
+
+The executor and DAG parser do *not* yet read the ``multiagent`` block;
+that wire-up arrives in v0.32 Phase 3.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Iterable, Literal
+
+
+# ---------------------------------------------------------------------------
+# Limits taken from Anthropic's preview docs
+# ---------------------------------------------------------------------------
+
+MAX_ROSTER_SIZE = 20
+MAX_CONCURRENT_THREADS = 25
+MAX_DEPTH = 1
+
+VALID_ENTRY_TYPES = frozenset({"agent", "self"})
+
+# SSE event types emitted on the coordinator (parent) thread when subagents
+# are running. We surface them via ``parse_thread_event``.
+KNOWN_THREAD_EVENTS = frozenset({
+    "agent.thread_message_received",
+    "agent.thread_message_sent",
+    "session.thread_started",
+    "session.thread_completed",
+    "session.thread_failed",
+})
+
+
+# ---------------------------------------------------------------------------
+# Exceptions
+# ---------------------------------------------------------------------------
+
+class RosterValidationError(Exception):
+    """Raised when a multiagent roster fails validation."""
+
+    def __init__(self, errors: list[str]) -> None:
+        self.errors = list(errors)
+        super().__init__("; ".join(self.errors) if self.errors else "invalid roster")
+
+
+# ---------------------------------------------------------------------------
+# Dataclasses
+# ---------------------------------------------------------------------------
+
+@dataclass
+class MultiagentRosterEntry:
+    """One entry in a coordinator's roster.
+
+    - ``type="agent"`` references another Managed Agent by ``id`` (and
+      optional pinned ``version``).
+    - ``type="self"`` lets the coordinator delegate to itself - useful
+      for recursive style flows. Only one ``self`` entry is allowed.
+    - ``nickname`` is the short label the coordinator uses to address
+      this entry in its routing prompt (e.g. ``"researcher"``).
+    """
+
+    type: Literal["agent", "self"] = "agent"
+    id: str | None = None
+    version: int | None = None
+    nickname: str | None = None
+
+    def to_payload(self) -> dict[str, Any]:
+        """Serialize to the Anthropic wire format."""
+        payload: dict[str, Any] = {"type": self.type}
+        if self.type == "agent":
+            if self.id:
+                payload["id"] = self.id
+            if self.version is not None:
+                payload["version"] = self.version
+        if self.nickname:
+            payload["nickname"] = self.nickname
+        return payload
+
+
+@dataclass
+class MultiagentConfig:
+    """Coordinator configuration."""
+
+    roster: list[MultiagentRosterEntry] = field(default_factory=list)
+    max_concurrent_threads: int = 25
+    prompt_routing_hint: str | None = None
+
+    def __post_init__(self) -> None:
+        errors = validate_roster(self.roster)
+        if errors:
+            raise RosterValidationError(errors)
+        if not isinstance(self.max_concurrent_threads, int):
+            raise RosterValidationError(
+                [f"max_concurrent_threads must be int, got "
+                 f"{type(self.max_concurrent_threads).__name__}"]
+            )
+        if self.max_concurrent_threads < 1:
+            raise RosterValidationError([
+                "max_concurrent_threads must be >= 1"
+            ])
+        if self.max_concurrent_threads > MAX_CONCURRENT_THREADS:
+            raise RosterValidationError([
+                f"max_concurrent_threads {self.max_concurrent_threads} exceeds "
+                f"limit of {MAX_CONCURRENT_THREADS}"
+            ])
+
+
+@dataclass
+class ThreadEvent:
+    """A parsed SSE event from a coordinator's parent thread."""
+
+    thread_id: str
+    agent_id: str
+    event_type: str
+    payload: dict[str, Any]
+
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+def validate_roster(roster: Iterable[MultiagentRosterEntry] | None) -> list[str]:
+    """Validate a roster. Returns a list of error strings (empty == ok)."""
+    errors: list[str] = []
+    if roster is None:
+        errors.append("roster must be a list, got None")
+        return errors
+
+    try:
+        items = list(roster)
+    except TypeError:
+        errors.append("roster must be iterable")
+        return errors
+
+    if len(items) > MAX_ROSTER_SIZE:
+        errors.append(
+            f"roster has {len(items)} entries, exceeds max of {MAX_ROSTER_SIZE}"
+        )
+
+    seen_self = 0
+    seen_ids: set[str] = set()
+    seen_nicknames: set[str] = set()
+
+    for idx, entry in enumerate(items):
+        if not isinstance(entry, MultiagentRosterEntry):
+            errors.append(
+                f"roster[{idx}] must be a MultiagentRosterEntry, "
+                f"got {type(entry).__name__}"
+            )
+            continue
+        if entry.type not in VALID_ENTRY_TYPES:
+            errors.append(
+                f"roster[{idx}] has invalid type '{entry.type}'. "
+                f"Must be one of: {', '.join(sorted(VALID_ENTRY_TYPES))}"
+            )
+            continue
+        if entry.type == "agent":
+            if not entry.id or not isinstance(entry.id, str):
+                errors.append(
+                    f"roster[{idx}] type=agent requires a non-empty 'id'"
+                )
+            else:
+                if entry.id in seen_ids:
+                    errors.append(
+                        f"roster[{idx}] duplicate agent id '{entry.id}'"
+                    )
+                seen_ids.add(entry.id)
+            if entry.version is not None and (
+                not isinstance(entry.version, int) or entry.version < 1
+            ):
+                errors.append(
+                    f"roster[{idx}] version must be a positive int, "
+                    f"got {entry.version!r}"
+                )
+        elif entry.type == "self":
+            seen_self += 1
+            if entry.id is not None:
+                errors.append(
+                    f"roster[{idx}] type=self must not carry an 'id'"
+                )
+
+        if entry.nickname is not None:
+            if not isinstance(entry.nickname, str) or not entry.nickname:
+                errors.append(
+                    f"roster[{idx}] nickname must be a non-empty string"
+                )
+            elif entry.nickname in seen_nicknames:
+                errors.append(
+                    f"roster[{idx}] duplicate nickname '{entry.nickname}'"
+                )
+            else:
+                seen_nicknames.add(entry.nickname)
+
+    if seen_self > 1:
+        errors.append(
+            f"roster has {seen_self} 'self' entries; at most 1 is allowed"
+        )
+
+    return errors
+
+
+# ---------------------------------------------------------------------------
+# Payload builder
+# ---------------------------------------------------------------------------
+
+def build_coordinator_payload(config: MultiagentConfig) -> dict[str, Any]:
+    """Return the ``multiagent`` payload chunk for an agent create call.
+
+    Shape::
+
+        {
+          "multiagent": {
+            "type": "coordinator",
+            "agents": [ {type, id, version?, nickname?}, ... ],
+            "max_concurrent_threads": 25,
+            "prompt_routing_hint": "..."   # only when set
+          }
+        }
+    """
+    errors = validate_roster(config.roster)
+    if errors:
+        raise RosterValidationError(errors)
+
+    block: dict[str, Any] = {
+        "type": "coordinator",
+        "agents": [e.to_payload() for e in config.roster],
+        "max_concurrent_threads": config.max_concurrent_threads,
+    }
+    if config.prompt_routing_hint:
+        block["prompt_routing_hint"] = config.prompt_routing_hint
+    return {"multiagent": block}
+
+
+# ---------------------------------------------------------------------------
+# SSE thread event parsing
+# ---------------------------------------------------------------------------
+
+def parse_thread_event(event: dict[str, Any]) -> ThreadEvent:
+    """Parse an Anthropic SSE thread event into a typed object.
+
+    Accepts both the flat shape ({type, thread_id, agent_id, ...}) and the
+    nested shape ({type, data: {...}}). The remaining payload fields end up
+    in ``ThreadEvent.payload``.
+    """
+    if not isinstance(event, dict):
+        raise ValueError(
+            f"event must be a mapping, got {type(event).__name__}"
+        )
+
+    event_type = event.get("type") or event.get("event") or ""
+    if not isinstance(event_type, str) or not event_type:
+        raise ValueError("event is missing a 'type' field")
+
+    # Some clients put fields inside a `data` envelope.
+    data = event.get("data") if isinstance(event.get("data"), dict) else {}
+    merged: dict[str, Any] = {}
+    merged.update(data)
+    for k, v in event.items():
+        if k in ("type", "event", "data"):
+            continue
+        merged[k] = v
+
+    thread_id = (
+        merged.pop("thread_id", None)
+        or merged.pop("threadId", None)
+        or ""
+    )
+    agent_id = (
+        merged.pop("agent_id", None)
+        or merged.pop("agentId", None)
+        or ""
+    )
+
+    return ThreadEvent(
+        thread_id=str(thread_id) if thread_id is not None else "",
+        agent_id=str(agent_id) if agent_id is not None else "",
+        event_type=event_type,
+        payload=merged,
+    )
+
+
+__all__ = [
+    "MAX_ROSTER_SIZE",
+    "MAX_CONCURRENT_THREADS",
+    "MAX_DEPTH",
+    "KNOWN_THREAD_EVENTS",
+    "VALID_ENTRY_TYPES",
+    "RosterValidationError",
+    "MultiagentRosterEntry",
+    "MultiagentConfig",
+    "ThreadEvent",
+    "validate_roster",
+    "build_coordinator_payload",
+    "parse_thread_event",
+]
diff --git a/src/sandcastle/engine/outcomes.py b/src/sandcastle/engine/outcomes.py
new file mode 100644
index 00000000..a41fc800
--- /dev/null
+++ b/src/sandcastle/engine/outcomes.py
@@ -0,0 +1,362 @@
+"""Anthropic Outcomes API client and composite aggregator.
+
+A self-contained async client for the Outcomes preview surface (beta header
+`managed-agents-2026-04-01`, added 2026-05-06). Outcomes let callers attach
+named pass/fail evaluations to a managed-agents session by posting
+`user.define_outcome` events; the platform streams back
+`span.outcome_evaluation_start` and `span.outcome_evaluation_end` events with
+structured judging results.
+
+This module provides:
+
+- `OutcomeDefinition` / `OutcomeEvaluation` dataclasses
+- `build_define_outcome_event` to shape outgoing payloads
+- `parse_outcome_evaluation` to decode incoming SSE events
+- `aggregate_outcomes` to compute composite, weighted scores (aligns with
+  Sandcastle's v0.25 Evolution scoring contract)
+- `AnthropicOutcomesClient` async HTTP client
+
+Designed for v0.32 of Sandcastle. No dependencies outside stdlib + httpx and
+no imports from other Sandcastle engine modules.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+
+import httpx
+
+DEFAULT_BETA_HEADER = "managed-agents-2026-04-01"
+DEFINE_OUTCOME_EVENT_TYPE = "user.define_outcome"
+OUTCOME_EVAL_START_TYPE = "span.outcome_evaluation_start"
+OUTCOME_EVAL_END_TYPE = "span.outcome_evaluation_end"
+
+
+class OutcomeValidationError(Exception):
+    """Raised when an OutcomeDefinition is malformed.
+
+    Examples: empty success_criteria, non-positive weight, missing name.
+    """
+
+
+class OutcomesAPIError(Exception):
+    """Raised on 4xx/5xx responses from the Outcomes API."""
+
+
+@dataclass
+class OutcomeDefinition:
+    """Declarative spec for a single outcome to evaluate against a session.
+
+    Validation runs in __post_init__ so callers cannot construct an invalid
+    definition silently. The `model` field overrides the judge model the
+    platform would otherwise pick on its own.
+    """
+
+    name: str
+    description: str
+    success_criteria: list[str]
+    weight: float = 1.0
+    model: str | None = None
+
+    def __post_init__(self) -> None:
+        if not self.name or not self.name.strip():
+            raise OutcomeValidationError("outcome name must be a non-empty string")
+        if not isinstance(self.success_criteria, list) or not self.success_criteria:
+            raise OutcomeValidationError(
+                "success_criteria must be a non-empty list of strings"
+            )
+        if any(
+            not isinstance(c, str) or not c.strip() for c in self.success_criteria
+        ):
+            raise OutcomeValidationError(
+                "each success criterion must be a non-empty string"
+            )
+        if self.weight <= 0:
+            raise OutcomeValidationError(
+                f"weight must be > 0; got {self.weight}"
+            )
+
+
+@dataclass
+class OutcomeEvaluation:
+    """Parsed result of a `span.outcome_evaluation_end` event."""
+
+    outcome_name: str
+    passed: bool
+    score: float
+    reasoning: str
+    evaluator_model: str
+    started_at: datetime
+    completed_at: datetime
+    cost_usd: float = 0.0
+    raw: dict[str, Any] = field(default_factory=dict)
+
+
+def build_define_outcome_event(definition: OutcomeDefinition) -> dict[str, Any]:
+    """Build the request body for POST /v1/sessions/{id}/events.
+
+    The shape mirrors the Outcomes preview spec: a `type` discriminator plus
+    an `outcome` object carrying the judging contract.
+    """
+    outcome: dict[str, Any] = {
+        "name": definition.name,
+        "description": definition.description,
+        "success_criteria": list(definition.success_criteria),
+        "weight": definition.weight,
+    }
+    if definition.model is not None:
+        outcome["model"] = definition.model
+    return {"type": DEFINE_OUTCOME_EVENT_TYPE, "outcome": outcome}
+
+
+def _parse_iso(value: Any) -> datetime:
+    """Parse an ISO-8601 timestamp. Accepts trailing 'Z' as UTC."""
+    if isinstance(value, datetime):
+        return value
+    if not isinstance(value, str) or not value:
+        return datetime.now(tz=timezone.utc)
+    normalised = value.replace("Z", "+00:00") if value.endswith("Z") else value
+    try:
+        return datetime.fromisoformat(normalised)
+    except ValueError:
+        return datetime.now(tz=timezone.utc)
+
+
+def parse_outcome_evaluation(event: dict[str, Any]) -> OutcomeEvaluation | None:
+    """Decode a `span.outcome_evaluation_end` event into an OutcomeEvaluation.
+
+    Returns None for any other event type (start events, unrelated spans, or
+    malformed payloads missing the outcome envelope).
+    """
+    if not isinstance(event, dict):
+        return None
+    if event.get("type") != OUTCOME_EVAL_END_TYPE:
+        return None
+    outcome = event.get("outcome") or event.get("data") or {}
+    if not isinstance(outcome, dict):
+        return None
+    name = outcome.get("name") or outcome.get("outcome_name")
+    if not name:
+        return None
+    passed = bool(outcome.get("passed", False))
+    try:
+        score = float(outcome.get("score", 1.0 if passed else 0.0))
+    except (TypeError, ValueError):
+        score = 1.0 if passed else 0.0
+    reasoning = str(outcome.get("reasoning") or outcome.get("explanation") or "")
+    evaluator_model = str(
+        outcome.get("evaluator_model") or outcome.get("model") or ""
+    )
+    started_at = _parse_iso(outcome.get("started_at") or event.get("started_at"))
+    completed_at = _parse_iso(
+        outcome.get("completed_at") or event.get("completed_at")
+    )
+    cost_raw = outcome.get("cost_usd")
+    if cost_raw is None:
+        usage = outcome.get("usage") or {}
+        cost_raw = usage.get("cost_usd", 0.0) if isinstance(usage, dict) else 0.0
+    try:
+        cost_usd = float(cost_raw or 0.0)
+    except (TypeError, ValueError):
+        cost_usd = 0.0
+    return OutcomeEvaluation(
+        outcome_name=str(name),
+        passed=passed,
+        score=score,
+        reasoning=reasoning,
+        evaluator_model=evaluator_model,
+        started_at=started_at,
+        completed_at=completed_at,
+        cost_usd=cost_usd,
+        raw=event,
+    )
+
+
+def aggregate_outcomes(
+    evaluations: list[OutcomeEvaluation],
+    weights: dict[str, float] | None = None,
+) -> dict[str, Any]:
+    """Aggregate evaluations into a composite, weighted score.
+
+    The composite is sum(passed_i * w_i) / sum(w_i), where `passed_i` is 1.0
+    when the evaluation passed and 0.0 otherwise. Weights default to 1.0 per
+    outcome; the optional `weights` map overrides by outcome name.
+
+    Returns a dict with keys: composite_score, pass_count, fail_count,
+    total_cost_usd, evaluated, weights_used.
+    """
+    if not evaluations:
+        return {
+            "composite_score": 0.0,
+            "pass_count": 0,
+            "fail_count": 0,
+            "total_cost_usd": 0.0,
+            "evaluated": 0,
+            "weights_used": {},
+        }
+    weights = weights or {}
+    weighted_sum = 0.0
+    weight_total = 0.0
+    pass_count = 0
+    fail_count = 0
+    total_cost = 0.0
+    weights_used: dict[str, float] = {}
+    for ev in evaluations:
+        w = float(weights.get(ev.outcome_name, 1.0))
+        if w <= 0:
+            continue
+        weights_used[ev.outcome_name] = w
+        weight_total += w
+        weighted_sum += (1.0 if ev.passed else 0.0) * w
+        if ev.passed:
+            pass_count += 1
+        else:
+            fail_count += 1
+        total_cost += ev.cost_usd
+    composite = weighted_sum / weight_total if weight_total > 0 else 0.0
+    return {
+        "composite_score": composite,
+        "pass_count": pass_count,
+        "fail_count": fail_count,
+        "total_cost_usd": total_cost,
+        "evaluated": len(evaluations),
+        "weights_used": weights_used,
+    }
+
+
+class AnthropicOutcomesClient:
+    """Async client for the Anthropic Outcomes API (preview)."""
+
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str = "https://api.anthropic.com",
+        beta_header: str = DEFAULT_BETA_HEADER,
+    ) -> None:
+        if not api_key:
+            raise ValueError("api_key must be a non-empty string")
+        self.api_key = api_key
+        self.base_url = base_url.rstrip("/")
+        self.beta_header = beta_header
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    def _headers(self) -> dict[str, str]:
+        return {
+            "x-api-key": self.api_key,
+            "anthropic-beta": self.beta_header,
+            "content-type": "application/json",
+        }
+
+    def _url(self, path: str) -> str:
+        return f"{self.base_url}{path}"
+
+    @staticmethod
+    def _raise_for_status(response: httpx.Response) -> None:
+        status = response.status_code
+        if status < 400:
+            return
+        try:
+            payload = response.json()
+            message = payload.get("error", {}).get("message") or str(payload)
+        except Exception:
+            message = response.text or f"HTTP {status}"
+        raise OutcomesAPIError(f"HTTP {status}: {message}")
+
+    async def _request(
+        self,
+        method: str,
+        path: str,
+        *,
+        json: dict[str, Any] | None = None,
+        params: dict[str, Any] | None = None,
+    ) -> httpx.Response:
+        async with httpx.AsyncClient() as client:
+            response = await client.request(
+                method,
+                self._url(path),
+                headers=self._headers(),
+                json=json,
+                params=params,
+            )
+        self._raise_for_status(response)
+        return response
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    async def define_outcome(
+        self,
+        session_id: str,
+        definition: OutcomeDefinition,
+    ) -> dict[str, Any]:
+        """POST a `user.define_outcome` event into a session.
+
+        Re-validates the definition (success_criteria non-empty) before
+        making the network call to fail fast on bad inputs.
+        """
+        if not definition.success_criteria:
+            raise OutcomeValidationError(
+                "cannot define outcome without success_criteria"
+            )
+        body = build_define_outcome_event(definition)
+        response = await self._request(
+            "POST", f"/v1/sessions/{session_id}/events", json=body
+        )
+        try:
+            return response.json()
+        except Exception:
+            return {}
+
+    async def list_outcomes(self, session_id: str) -> list[OutcomeEvaluation]:
+        """Fetch all completed evaluations for a session.
+
+        Tolerates two response shapes: `{"data": [...]}` or a bare list. Each
+        item is interpreted as a `span.outcome_evaluation_end`-shaped event
+        and run through `parse_outcome_evaluation`; items that fail to parse
+        are silently dropped.
+        """
+        response = await self._request(
+            "GET", f"/v1/sessions/{session_id}/outcomes"
+        )
+        payload = response.json()
+        if isinstance(payload, list):
+            items = payload
+        elif isinstance(payload, dict):
+            items = payload.get("data", []) or payload.get("outcomes", [])
+        else:
+            items = []
+        parsed: list[OutcomeEvaluation] = []
+        for item in items:
+            if not isinstance(item, dict):
+                continue
+            # The /outcomes endpoint returns bare outcome objects; wrap them
+            # in the SSE event envelope so parse_outcome_evaluation can be
+            # used uniformly.
+            if item.get("type") == OUTCOME_EVAL_END_TYPE:
+                envelope = item
+            else:
+                envelope = {"type": OUTCOME_EVAL_END_TYPE, "outcome": item}
+            ev = parse_outcome_evaluation(envelope)
+            if ev is not None:
+                parsed.append(ev)
+        return parsed
+
+
+__all__ = [
+    "AnthropicOutcomesClient",
+    "OutcomeDefinition",
+    "OutcomeEvaluation",
+    "OutcomeValidationError",
+    "OutcomesAPIError",
+    "build_define_outcome_event",
+    "parse_outcome_evaluation",
+    "aggregate_outcomes",
+    "DEFAULT_BETA_HEADER",
+    "DEFINE_OUTCOME_EVENT_TYPE",
+    "OUTCOME_EVAL_START_TYPE",
+    "OUTCOME_EVAL_END_TYPE",
+]
diff --git a/src/sandcastle/engine/tool_search.py b/src/sandcastle/engine/tool_search.py
new file mode 100644
index 00000000..430aafec
--- /dev/null
+++ b/src/sandcastle/engine/tool_search.py
@@ -0,0 +1,234 @@
+"""Tool Search + Tool Use Examples registry.
+
+Lightweight registry for agent-callable tools with:
+- Token-overlap search across name, description, tags
+- Hot/lazy partitioning for deferred loading of rare tools
+- 1-5 worked examples per tool (input/output) baked into the definition
+- JSON-Schema validation of examples against parameter schemas
+- Anthropic-compatible tool definition shape on demand
+
+Based on observed accuracy gains:
+- Tool selection accuracy: 49 percent -> 74 percent with tool search
+- Parameter-shape accuracy: 72 percent -> 90 percent with 1-5 examples per tool
+
+This module is self-contained and does not touch the executor, DAG, or
+existing connector tool registries. Connector authors opt in by registering
+their tools with the module-level ``default_registry``.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from typing import Any
+
+from jsonschema import Draft202012Validator, ValidationError
+
+__all__ = [
+    "ToolDefinition",
+    "ToolRegistry",
+    "default_registry",
+    "validate_tool",
+]
+
+
+_TOKEN_RE = re.compile(r"[a-zA-Z0-9_]+")
+
+
+def _tokenize(text: str) -> set[str]:
+    """Lowercase token set used by the search ranker."""
+    if not text:
+        return set()
+    return {tok.lower() for tok in _TOKEN_RE.findall(text)}
+
+
+@dataclass
+class ToolDefinition:
+    """Single tool entry in the registry.
+
+    Attributes:
+        name: Stable tool identifier. Unique within a registry.
+        description: Human and agent readable summary (>= 20 chars).
+        parameters: JSON Schema describing accepted inputs.
+        examples: 1-5 entries, each ``{"input": dict, "output": dict}``.
+        defer_loading: When True the tool is considered "lazy" and is only
+            surfaced via explicit search. Use for rare or expensive tools.
+        tags: Free-form labels used by the search ranker.
+    """
+
+    name: str
+    description: str
+    parameters: dict
+    examples: list[dict]
+    defer_loading: bool = False
+    tags: list[str] = field(default_factory=list)
+
+
+def validate_tool(tool: ToolDefinition) -> list[str]:
+    """Return a list of human-readable validation errors.
+
+    Empty list means the tool is well-formed. Checks:
+    - name is non-empty
+    - description >= 20 characters
+    - parameters is a valid JSON Schema (Draft 2020-12)
+    - 1 <= len(examples) <= 5
+    - each example has both 'input' and 'output' as dicts
+    - each example input validates against parameters
+    """
+    errors: list[str] = []
+
+    if not tool.name or not isinstance(tool.name, str):
+        errors.append("tool.name must be a non-empty string")
+
+    if not isinstance(tool.description, str) or len(tool.description) < 20:
+        errors.append(
+            "tool.description must be at least 20 characters"
+            " so agents can disambiguate similar tools"
+        )
+
+    if not isinstance(tool.parameters, dict):
+        errors.append("tool.parameters must be a dict JSON Schema")
+        # Cannot validate examples without a schema.
+        return errors
+
+    try:
+        Draft202012Validator.check_schema(tool.parameters)
+    except Exception as exc:  # jsonschema.SchemaError or similar
+        errors.append(f"tool.parameters is not a valid JSON Schema: {exc}")
+        return errors
+
+    if not isinstance(tool.examples, list) or len(tool.examples) < 1:
+        errors.append(
+            "tool.examples must contain 1 to 5 entries"
+            " (parameter-shape accuracy jumps with even one example)"
+        )
+    elif len(tool.examples) > 5:
+        errors.append(
+            f"tool.examples has {len(tool.examples)} entries; max is 5"
+            " (more examples bloat the system prompt without gain)"
+        )
+    else:
+        validator = Draft202012Validator(tool.parameters)
+        for idx, ex in enumerate(tool.examples):
+            if not isinstance(ex, dict):
+                errors.append(f"examples[{idx}] must be a dict")
+                continue
+            if "input" not in ex or not isinstance(ex["input"], dict):
+                errors.append(f"examples[{idx}].input must be a dict")
+                continue
+            if "output" not in ex or not isinstance(ex["output"], dict):
+                errors.append(f"examples[{idx}].output must be a dict")
+                continue
+            try:
+                validator.validate(ex["input"])
+            except ValidationError as exc:
+                errors.append(
+                    f"examples[{idx}].input fails parameters schema: {exc.message}"
+                )
+
+    return errors
+
+
+class ToolRegistry:
+    """In-memory registry of ``ToolDefinition`` keyed by name."""
+
+    def __init__(self) -> None:
+        self._tools: dict[str, ToolDefinition] = {}
+
+    # ------------------------------------------------------------------ core
+
+    def register(self, tool: ToolDefinition) -> None:
+        """Add or replace a tool by name."""
+        self._tools[tool.name] = tool
+
+    def get(self, name: str) -> ToolDefinition | None:
+        return self._tools.get(name)
+
+    def all(self) -> list[ToolDefinition]:
+        return list(self._tools.values())
+
+    def __len__(self) -> int:
+        return len(self._tools)
+
+    def __contains__(self, name: str) -> bool:
+        return name in self._tools
+
+    # ---------------------------------------------------------------- search
+
+    def search(self, query: str, limit: int = 5) -> list[ToolDefinition]:
+        """Return up to ``limit`` tools ranked by relevance to ``query``.
+
+        Scoring:
+        - +100 if the query exactly matches a tool name
+        - +10 per query token found in the tool name
+        - +3 per query token found in tags
+        - +1 per query token found in the description
+        Ties are broken by registration order (stable).
+        """
+        if not query or not self._tools:
+            return []
+
+        q_lower = query.strip().lower()
+        q_tokens = _tokenize(query)
+        if not q_tokens:
+            return []
+
+        scored: list[tuple[float, int, ToolDefinition]] = []
+        for idx, tool in enumerate(self._tools.values()):
+            name_tokens = _tokenize(tool.name)
+            desc_tokens = _tokenize(tool.description)
+            tag_tokens = _tokenize(" ".join(tool.tags))
+
+            score = 0.0
+            if tool.name.lower() == q_lower:
+                score += 100.0
+
+            for tok in q_tokens:
+                if tok in name_tokens:
+                    score += 10.0
+                if tok in tag_tokens:
+                    score += 3.0
+                if tok in desc_tokens:
+                    score += 1.0
+
+            if score > 0:
+                # Negative idx so earlier registrations win ties.
+                scored.append((score, -idx, tool))
+
+        scored.sort(key=lambda item: (item[0], item[1]), reverse=True)
+        return [tool for _, _, tool in scored[:limit]]
+
+    # ------------------------------------------------------------ partitions
+
+    def hot_tools(self) -> list[ToolDefinition]:
+        """Tools loaded eagerly into the agent's system prompt."""
+        return [t for t in self._tools.values() if not t.defer_loading]
+
+    def lazy_tools(self) -> list[ToolDefinition]:
+        """Tools the agent must explicitly fetch via search."""
+        return [t for t in self._tools.values() if t.defer_loading]
+
+    # --------------------------------------------------------------- adapter
+
+    @staticmethod
+    def format_for_agent(tools: list[ToolDefinition]) -> list[dict]:
+        """Convert tools to the Anthropic tool definition shape.
+
+        Produces a list of ``{name, description, input_schema, examples?}``.
+        Examples are included when present so they ride along in the prompt.
+        """
+        out: list[dict] = []
+        for tool in tools:
+            entry: dict[str, Any] = {
+                "name": tool.name,
+                "description": tool.description,
+                "input_schema": tool.parameters,
+            }
+            if tool.examples:
+                entry["examples"] = list(tool.examples)
+            out.append(entry)
+        return out
+
+
+# Module-level singleton used by connector authors.
+default_registry: ToolRegistry = ToolRegistry()
diff --git a/src/sandcastle/engine/trajectory_replay.py b/src/sandcastle/engine/trajectory_replay.py
new file mode 100644
index 00000000..b4667a39
--- /dev/null
+++ b/src/sandcastle/engine/trajectory_replay.py
@@ -0,0 +1,482 @@
+"""Trajectory Replay step type primitives (v0.32 prep).
+
+This module provides pure-Python primitives for capturing, checksumming,
+diffing, and scoring agent trajectories. A trajectory is the ordered
+sequence of tool calls produced during a workflow run, plus its final
+output, total cost, and total duration.
+
+Trajectory-level evaluation grades the tool-call sequence (the "how"),
+not just the final output (the "what"). Combined with Sandcastle's
+SHA-256 audit chain, this enables cryptographically verifiable golden
+trajectories suitable for regression gating and Annex IV transparency.
+
+YAML step type specification (Phase 3 wiring is intentionally deferred,
+this module does NOT register the step type in dag.py):
+
+    - id: replay_check
+      type: trajectory-replay
+      trajectory_replay_config:
+        golden_run_id: run_2026_05_01_abc123
+        fail_below_score: 0.8       # 0..1, defaults to 0.8
+        allow_cost_delta_pct: 10.0  # percent, defaults to 10.0
+
+Expected runtime behaviour (for Phase 3 implementer):
+  1. Load the golden Trajectory by run_id.
+  2. Extract the candidate Trajectory from the current run's audit
+     events and step records using ``extract_trajectory``.
+  3. Compute the diff via ``diff_trajectories`` and the score via
+     ``replay_score``.
+  4. Fail the step when ``score < fail_below_score`` or when
+     ``abs(cost_delta_usd) / golden.total_cost_usd * 100`` exceeds
+     ``allow_cost_delta_pct``.
+
+This module is pure: no DB, no HTTP, no provider SDKs. Only stdlib.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from typing import Any, Literal
+
+
+ToolCallDiffKind = Literal[
+    "added",
+    "removed",
+    "args_changed",
+    "output_changed",
+    "order_changed",
+]
+
+
+@dataclass
+class ToolCall:
+    """A single tool invocation within a trajectory."""
+
+    step_id: str
+    tool_name: str
+    args: dict[str, Any]
+    output: dict[str, Any] | str
+    error: str | None
+    duration_ms: int
+    ts: datetime
+
+
+@dataclass
+class Trajectory:
+    """An ordered sequence of tool calls plus final output and totals."""
+
+    run_id: str
+    workflow_name: str
+    version: int
+    tool_calls: list[ToolCall]
+    total_cost_usd: float
+    total_duration_ms: int
+    final_output: dict[str, Any]
+    checksum: str = ""
+
+
+@dataclass
+class ToolCallDiff:
+    """One difference between two trajectories at the tool-call level."""
+
+    kind: ToolCallDiffKind
+    step_id: str
+    golden: ToolCall | None
+    candidate: ToolCall | None
+    details: str
+
+
+@dataclass
+class TrajectoryDiff:
+    """Structured diff between a golden and candidate trajectory."""
+
+    tool_call_diffs: list[ToolCallDiff] = field(default_factory=list)
+    cost_delta_usd: float = 0.0
+    duration_delta_ms: int = 0
+    final_output_match: bool = True
+    summary: str = ""
+
+
+# ---------------------------------------------------------------------------
+# Canonical serialisation + checksum
+# ---------------------------------------------------------------------------
+
+
+def _canonical(value: Any) -> Any:
+    """Convert dataclasses, datetimes, and dicts into a JSON-safe form.
+
+    Datetimes are serialised as ISO-8601 strings to keep checksums stable
+    across host timezones (we serialise via ``isoformat`` which preserves
+    any tzinfo attached to the datetime).
+    """
+
+    if isinstance(value, datetime):
+        return value.isoformat()
+    if isinstance(value, dict):
+        return {k: _canonical(value[k]) for k in sorted(value)}
+    if isinstance(value, (list, tuple)):
+        return [_canonical(v) for v in value]
+    if hasattr(value, "__dataclass_fields__"):
+        return _canonical(asdict(value))
+    return value
+
+
+def compute_trajectory_checksum(trajectory: Trajectory) -> str:
+    """Return a deterministic SHA-256 hex digest for a trajectory.
+
+    The checksum covers the canonical-JSON form of ``tool_calls`` and
+    ``final_output`` (order-sensitive for tool_calls, key-sorted for
+    dicts). Other fields like ``run_id`` are intentionally excluded so
+    that the same logical trajectory replayed under a different run id
+    still hashes the same.
+    """
+
+    payload = {
+        "tool_calls": _canonical(trajectory.tool_calls),
+        "final_output": _canonical(trajectory.final_output),
+    }
+    encoded = json.dumps(
+        payload,
+        sort_keys=True,
+        separators=(",", ":"),
+        ensure_ascii=False,
+    ).encode("utf-8")
+    return hashlib.sha256(encoded).hexdigest()
+
+
+# ---------------------------------------------------------------------------
+# Extraction from audit events + step records
+# ---------------------------------------------------------------------------
+
+
+def _parse_ts(value: Any) -> datetime:
+    """Accept datetime or ISO-8601 string. Falls back to epoch on junk."""
+
+    if isinstance(value, datetime):
+        return value
+    if isinstance(value, str):
+        try:
+            return datetime.fromisoformat(value)
+        except ValueError:
+            pass
+    return datetime.fromtimestamp(0)
+
+
+def extract_trajectory(
+    run_id: str,
+    audit_events: list[dict[str, Any]],
+    run_steps: list[dict[str, Any]],
+) -> Trajectory:
+    """Build a Trajectory from a run's audit events and step records.
+
+    Audit events are expected to contain entries of the shape::
+
+        {"event_type": "step.started"  | "step.completed" | "step.failed",
+         "step_id": str, "ts": datetime | iso-str, "data": {...}}
+
+    Run steps provide the tool-call payload::
+
+        {"step_id": str, "tool_name": str, "args": dict,
+         "output": dict | str, "error": str | None, "cost_usd": float,
+         "workflow_name": str (optional), "version": int (optional),
+         "final_output": dict (optional, set on the terminal step)}
+
+    Ordering is taken from ``audit_events`` (step.started timestamps).
+    Duration is computed as ``completed.ts - started.ts`` in milliseconds
+    when both events are present, otherwise it falls back to the step
+    record's ``duration_ms`` field, otherwise zero.
+
+    This function is pure: it does not read or write any persistent
+    store.
+    """
+
+    steps_by_id: dict[str, dict[str, Any]] = {s["step_id"]: s for s in run_steps}
+
+    started: dict[str, datetime] = {}
+    completed: dict[str, datetime] = {}
+    order: list[str] = []
+    for ev in audit_events:
+        etype = ev.get("event_type", "")
+        sid = ev.get("step_id")
+        if not sid:
+            continue
+        ts = _parse_ts(ev.get("ts"))
+        if etype == "step.started":
+            if sid not in started:
+                started[sid] = ts
+                order.append(sid)
+        elif etype in {"step.completed", "step.failed"}:
+            completed[sid] = ts
+
+    tool_calls: list[ToolCall] = []
+    for sid in order:
+        step = steps_by_id.get(sid)
+        if step is None:
+            continue
+        ts = started.get(sid, _parse_ts(step.get("ts")))
+        if sid in completed:
+            duration_ms = int(
+                max(0.0, (completed[sid] - started[sid]).total_seconds() * 1000)
+            )
+        else:
+            duration_ms = int(step.get("duration_ms", 0))
+        tool_calls.append(
+            ToolCall(
+                step_id=sid,
+                tool_name=step.get("tool_name", ""),
+                args=dict(step.get("args", {})),
+                output=step.get("output", {}),
+                error=step.get("error"),
+                duration_ms=duration_ms,
+                ts=ts,
+            )
+        )
+
+    total_cost = sum(float(s.get("cost_usd", 0.0) or 0.0) for s in run_steps)
+    total_duration = sum(tc.duration_ms for tc in tool_calls)
+
+    workflow_name = ""
+    version = 0
+    final_output: dict[str, Any] = {}
+    for s in run_steps:
+        if "workflow_name" in s and not workflow_name:
+            workflow_name = str(s["workflow_name"])
+        if "version" in s and not version:
+            try:
+                version = int(s["version"])
+            except (TypeError, ValueError):
+                version = 0
+        if "final_output" in s and s["final_output"] is not None:
+            final_output = dict(s["final_output"])
+
+    trajectory = Trajectory(
+        run_id=run_id,
+        workflow_name=workflow_name,
+        version=version,
+        tool_calls=tool_calls,
+        total_cost_usd=round(total_cost, 10),
+        total_duration_ms=total_duration,
+        final_output=final_output,
+    )
+    trajectory.checksum = compute_trajectory_checksum(trajectory)
+    return trajectory
+
+
+# ---------------------------------------------------------------------------
+# Diff
+# ---------------------------------------------------------------------------
+
+
+def _key(tc: ToolCall) -> tuple[str, str]:
+    return (tc.step_id, tc.tool_name)
+
+
+def diff_trajectories(
+    golden: Trajectory,
+    candidate: Trajectory,
+) -> TrajectoryDiff:
+    """Diff two trajectories at the tool-call level.
+
+    The diff reports five kinds of changes:
+
+    * ``added``         - a tool call present in ``candidate`` only.
+    * ``removed``       - a tool call present in ``golden`` only.
+    * ``args_changed``  - same step_id + tool_name, different ``args``.
+    * ``output_changed``- same step_id + tool_name, different ``output``
+                          or ``error``.
+    * ``order_changed`` - both trajectories share the same set of
+                          (step_id, tool_name) pairs but their ordering
+                          differs.
+
+    ``final_output_match`` compares the two ``final_output`` dicts via
+    canonical JSON to be insensitive to key ordering.
+    """
+
+    diffs: list[ToolCallDiff] = []
+
+    g_index: dict[str, ToolCall] = {tc.step_id: tc for tc in golden.tool_calls}
+    c_index: dict[str, ToolCall] = {tc.step_id: tc for tc in candidate.tool_calls}
+
+    # added / removed
+    for sid, gtc in g_index.items():
+        if sid not in c_index:
+            diffs.append(
+                ToolCallDiff(
+                    kind="removed",
+                    step_id=sid,
+                    golden=gtc,
+                    candidate=None,
+                    details=f"step '{sid}' missing in candidate",
+                )
+            )
+    for sid, ctc in c_index.items():
+        if sid not in g_index:
+            diffs.append(
+                ToolCallDiff(
+                    kind="added",
+                    step_id=sid,
+                    golden=None,
+                    candidate=ctc,
+                    details=f"step '{sid}' added in candidate",
+                )
+            )
+
+    # args / output / order for matched ids
+    shared_ids = [sid for sid in g_index if sid in c_index]
+    for sid in shared_ids:
+        gtc = g_index[sid]
+        ctc = c_index[sid]
+        if _canonical(gtc.args) != _canonical(ctc.args):
+            diffs.append(
+                ToolCallDiff(
+                    kind="args_changed",
+                    step_id=sid,
+                    golden=gtc,
+                    candidate=ctc,
+                    details=f"args differ for step '{sid}'",
+                )
+            )
+        if _canonical(gtc.output) != _canonical(ctc.output) or (gtc.error != ctc.error):
+            diffs.append(
+                ToolCallDiff(
+                    kind="output_changed",
+                    step_id=sid,
+                    golden=gtc,
+                    candidate=ctc,
+                    details=f"output differs for step '{sid}'",
+                )
+            )
+
+    # order changes are only meaningful when both sequences share the
+    # same set of (step_id, tool_name) pairs but in a different order.
+    g_keys = [_key(tc) for tc in golden.tool_calls]
+    c_keys = [_key(tc) for tc in candidate.tool_calls]
+    if sorted(g_keys) == sorted(c_keys) and g_keys != c_keys:
+        diffs.append(
+            ToolCallDiff(
+                kind="order_changed",
+                step_id="",
+                golden=None,
+                candidate=None,
+                details=(
+                    "tool calls match by set but order differs: "
+                    f"golden={[k[0] for k in g_keys]} "
+                    f"candidate={[k[0] for k in c_keys]}"
+                ),
+            )
+        )
+
+    cost_delta = round(candidate.total_cost_usd - golden.total_cost_usd, 10)
+    duration_delta = candidate.total_duration_ms - golden.total_duration_ms
+    final_match = _canonical(golden.final_output) == _canonical(candidate.final_output)
+
+    parts = [f"{len(diffs)} tool-call diff(s)"]
+    parts.append(f"cost delta {cost_delta:+.6f} USD")
+    parts.append(f"duration delta {duration_delta:+d} ms")
+    parts.append("final output " + ("match" if final_match else "mismatch"))
+    summary = ", ".join(parts)
+
+    return TrajectoryDiff(
+        tool_call_diffs=diffs,
+        cost_delta_usd=cost_delta,
+        duration_delta_ms=duration_delta,
+        final_output_match=final_match,
+        summary=summary,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Score
+# ---------------------------------------------------------------------------
+
+
+_DEFAULT_WEIGHTS: dict[str, float] = {
+    "tool_match": 0.60,
+    "final_output": 0.30,
+    "cost": 0.10,
+}
+
+
+def replay_score(
+    diff: TrajectoryDiff,
+    weights: dict[str, float] | None = None,
+    cost_budget_usd: float = 0.01,
+) -> float:
+    """Combine a diff into a 0..1 replay score.
+
+    Defaults weight tool-call match rate at 60 percent, final-output
+    match at 30 percent, and cost-within-budget at 10 percent. A perfect
+    match returns 1.0. Custom weights override the defaults; missing
+    keys fall back to the defaults and the result is normalised by the
+    sum of the supplied weights so callers can pass any positive
+    numbers.
+
+    ``cost_budget_usd`` defines how much absolute cost drift is fully
+    tolerated before the cost component starts to decay. Beyond
+    ``cost_budget_usd`` the cost component decays linearly to zero at
+    ten times the budget.
+    """
+
+    w = dict(_DEFAULT_WEIGHTS)
+    if weights:
+        for k, v in weights.items():
+            if v < 0:
+                raise ValueError(f"weight '{k}' must be non-negative")
+            w[k] = float(v)
+    total_w = w["tool_match"] + w["final_output"] + w["cost"]
+    if total_w <= 0:
+        raise ValueError("sum of weights must be positive")
+
+    # Tool-call match component: penalise added / removed / changed /
+    # order-changed equally. The denominator is the larger of the count
+    # of matched-or-mismatched calls and 1 to avoid div-by-zero on empty
+    # trajectories.
+    mismatched = len(diff.tool_call_diffs)
+    # We do not have direct access to the trajectory lengths here, so we
+    # rebuild an approximation from the diff: any "removed" pair plus
+    # any "added" pair plus any shared step that had args/output/order
+    # changes. A perfect diff (empty) yields 1.0.
+    if mismatched == 0:
+        tool_match = 1.0
+    else:
+        # Approximate total calls as mismatched + shared-clean. We do not
+        # know shared-clean from the diff alone, so we use a soft decay:
+        # score = 1 / (1 + mismatched).
+        tool_match = 1.0 / (1.0 + mismatched)
+
+    final_output = 1.0 if diff.final_output_match else 0.0
+
+    abs_cost = abs(diff.cost_delta_usd)
+    if abs_cost <= cost_budget_usd:
+        cost_score = 1.0
+    else:
+        # Linear decay from 1.0 at budget down to 0.0 at 10x budget.
+        slope_end = cost_budget_usd * 10
+        if abs_cost >= slope_end:
+            cost_score = 0.0
+        else:
+            cost_score = 1.0 - (abs_cost - cost_budget_usd) / (slope_end - cost_budget_usd)
+
+    score = (
+        w["tool_match"] * tool_match
+        + w["final_output"] * final_output
+        + w["cost"] * cost_score
+    ) / total_w
+    # Clamp for safety against floating-point drift.
+    return max(0.0, min(1.0, score))
+
+
+__all__ = [
+    "ToolCall",
+    "Trajectory",
+    "ToolCallDiff",
+    "ToolCallDiffKind",
+    "TrajectoryDiff",
+    "compute_trajectory_checksum",
+    "extract_trajectory",
+    "diff_trajectories",
+    "replay_score",
+]
diff --git a/src/sandcastle/main.py b/src/sandcastle/main.py
index d81bfab2..70fdca5d 100644
--- a/src/sandcastle/main.py
+++ b/src/sandcastle/main.py
@@ -16,6 +16,7 @@
 
 from sandcastle import __version__
 from sandcastle.api.a2a import a2a_router
+from sandcastle.api.agent_webhooks import router as agent_webhooks_router
 from sandcastle.api.agui import agui_router
 from sandcastle.api.auth import auth_middleware
 from sandcastle.api.routes import router
@@ -462,6 +463,9 @@ async def lifespan(app: FastAPI):
 # AG-UI protocol routes (/api/agui/stream/{run_id})
 app.include_router(agui_router, prefix="/api/agui")
 
+# Anthropic Managed Agents webhook receiver (root level - /agent-webhooks/anthropic)
+app.include_router(agent_webhooks_router)
+
 # ---------------------------------------------------------------------------
 # Dashboard static files (served from the same port)
 # ---------------------------------------------------------------------------
diff --git a/tests/test_agent_runtime_v30.py b/tests/test_agent_runtime_v30.py
index 8c22a28f..d5d25868 100644
--- a/tests/test_agent_runtime_v30.py
+++ b/tests/test_agent_runtime_v30.py
@@ -215,7 +215,7 @@ def test_invalid_name(self):
             get_runtime("quantum")
 
     def test_registry_contains_all(self):
-        assert set(RUNTIMES.keys()) == {"auto", "anthropic", "local"}
+        assert set(RUNTIMES.keys()) == {"auto", "anthropic", "local", "agent-sdk"}
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/test_agent_sdk_runtime.py b/tests/test_agent_sdk_runtime.py
new file mode 100644
index 00000000..086d720f
--- /dev/null
+++ b/tests/test_agent_sdk_runtime.py
@@ -0,0 +1,255 @@
+"""Tests for the Claude Agent SDK alternative runtime."""
+
+from __future__ import annotations
+
+import importlib
+import sys
+from dataclasses import dataclass, field
+from types import ModuleType
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from sandcastle.engine import agent_sdk_runtime as sdk_rt
+from sandcastle.engine.agent_sdk_runtime import (
+    AgentSDKConfig,
+    AgentSDKConfigError,
+    AgentSDKNotInstalled,
+    AgentSDKResult,
+    AgentSDKRunner,
+    is_available,
+    validate_config,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers: fake SDK that mirrors the shape the runtime expects.
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class _FakeResponse:
+    output: str = "hello"
+    tool_calls: list[dict] = field(default_factory=list)
+    cost_usd: float = 0.0
+    transcript_path: str | None = None
+    error: str | None = None
+
+
+class _FakeAgentDefinition:
+    def __init__(self, **kwargs: Any) -> None:
+        self.kwargs = kwargs
+
+
+class _FakeClaudeAgent:
+    """Stand-in for ``claude_agent_sdk.ClaudeAgent``.
+
+    Records the definition it was constructed with and the prompt it was run
+    on, so tests can assert config round-trips correctly.
+    """
+
+    last_instance: "_FakeClaudeAgent | None" = None
+
+    def __init__(self, definition: _FakeAgentDefinition) -> None:
+        self.definition = definition
+        self.run = AsyncMock(return_value=_FakeResponse())
+        _FakeClaudeAgent.last_instance = self
+
+
+def _install_fake_sdk(monkeypatch: pytest.MonkeyPatch, response: _FakeResponse | None = None) -> type:
+    """Install a fake ``claude_agent_sdk`` module into ``sys.modules``.
+
+    Returns the fake ClaudeAgent class so tests can inspect it.
+    """
+
+    fake_module = ModuleType("claude_agent_sdk")
+
+    if response is not None:
+        class _Agent(_FakeClaudeAgent):
+            def __init__(self, definition: _FakeAgentDefinition) -> None:
+                super().__init__(definition)
+                self.run = AsyncMock(return_value=response)
+        agent_cls: type = _Agent
+    else:
+        agent_cls = _FakeClaudeAgent
+
+    fake_module.ClaudeAgent = agent_cls  # type: ignore[attr-defined]
+    fake_module.AgentDefinition = _FakeAgentDefinition  # type: ignore[attr-defined]
+
+    monkeypatch.setitem(sys.modules, "claude_agent_sdk", fake_module)
+    return agent_cls
+
+
+def _block_sdk_import(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Make ``import claude_agent_sdk`` raise ImportError inside the runtime."""
+
+    # Drop any cached copy and shadow with a meta path finder that refuses it.
+    monkeypatch.delitem(sys.modules, "claude_agent_sdk", raising=False)
+
+    import importlib.abc
+    import importlib.machinery
+
+    class _Blocker(importlib.abc.MetaPathFinder):
+        def find_spec(self, name: str, path: Any = None, target: Any = None) -> Any:
+            if name == "claude_agent_sdk":
+                raise ImportError("blocked for test")
+            return None
+
+    blocker = _Blocker()
+    monkeypatch.setattr(sys, "meta_path", [blocker, *sys.meta_path])
+
+
+# ---------------------------------------------------------------------------
+# Tests.
+# ---------------------------------------------------------------------------
+
+
+def test_module_importable_without_sdk(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Lazy import: re-importing the runtime works even when SDK is missing."""
+
+    _block_sdk_import(monkeypatch)
+    monkeypatch.delitem(sys.modules, "sandcastle.engine.agent_sdk_runtime", raising=False)
+
+    module = importlib.import_module("sandcastle.engine.agent_sdk_runtime")
+    assert hasattr(module, "AgentSDKRunner")
+    assert module.is_available() is False
+
+
+def test_is_available_false_when_sdk_missing(monkeypatch: pytest.MonkeyPatch) -> None:
+    _block_sdk_import(monkeypatch)
+    assert is_available() is False
+
+
+def test_is_available_true_when_sdk_present(monkeypatch: pytest.MonkeyPatch) -> None:
+    _install_fake_sdk(monkeypatch)
+    assert is_available() is True
+
+
+@pytest.mark.asyncio
+async def test_run_raises_when_sdk_missing(monkeypatch: pytest.MonkeyPatch) -> None:
+    _block_sdk_import(monkeypatch)
+    runner = AgentSDKRunner()
+    with pytest.raises(AgentSDKNotInstalled):
+        await runner.run("hi", AgentSDKConfig())
+
+
+@pytest.mark.asyncio
+async def test_run_invokes_sdk_with_config(monkeypatch: pytest.MonkeyPatch) -> None:
+    agent_cls = _install_fake_sdk(monkeypatch)
+
+    cfg = AgentSDKConfig(
+        model="claude-sonnet-4-6",
+        system_prompt="be helpful",
+        tools=[{"name": "search"}],
+        mcp_servers={"local": "http://localhost:9000"},
+        permission_mode="auto",
+        skills_dir="/tmp/skills",
+        commands_dir="/tmp/cmds",
+        working_dir="/tmp/work",
+        max_turns=7,
+        timeout_seconds=60,
+    )
+
+    runner = AgentSDKRunner()
+    result = await runner.run("hello there", cfg)
+
+    assert isinstance(result, AgentSDKResult)
+    instance = agent_cls.last_instance  # type: ignore[attr-defined]
+    assert instance is not None
+    assert instance.definition.kwargs["model"] == "claude-sonnet-4-6"
+    assert instance.definition.kwargs["system_prompt"] == "be helpful"
+    assert instance.definition.kwargs["tools"] == [{"name": "search"}]
+    assert instance.definition.kwargs["mcp_servers"] == {"local": "http://localhost:9000"}
+    assert instance.definition.kwargs["max_turns"] == 7
+    instance.run.assert_awaited_once_with("hello there")
+
+
+@pytest.mark.asyncio
+async def test_result_parsed_from_sdk_response(monkeypatch: pytest.MonkeyPatch) -> None:
+    response = _FakeResponse(
+        output="final answer",
+        tool_calls=[{"name": "bash", "input": {"cmd": "ls"}}],
+        cost_usd=0.0123,
+        transcript_path="/tmp/transcript.jsonl",
+        error=None,
+    )
+    _install_fake_sdk(monkeypatch, response=response)
+
+    runner = AgentSDKRunner()
+    result = await runner.run("go", AgentSDKConfig())
+
+    assert result.output == "final answer"
+    assert result.tool_calls == [{"name": "bash", "input": {"cmd": "ls"}}]
+    assert result.cost_usd == pytest.approx(0.0123)
+    assert result.transcript_path == "/tmp/transcript.jsonl"
+    assert result.error is None
+    assert result.duration_ms >= 0
+
+
+@pytest.mark.asyncio
+async def test_cost_reported_back_from_result(monkeypatch: pytest.MonkeyPatch) -> None:
+    response = _FakeResponse(output="ok", cost_usd=2.5)
+    _install_fake_sdk(monkeypatch, response=response)
+
+    runner = AgentSDKRunner()
+    result = await runner.run("compute", AgentSDKConfig())
+
+    assert result.cost_usd == pytest.approx(2.5)
+
+
+def test_validate_config_rejects_non_positive_max_turns() -> None:
+    errors = validate_config(AgentSDKConfig(max_turns=0))
+    assert any("max_turns" in e for e in errors)
+
+    errors = validate_config(AgentSDKConfig(max_turns=-3))
+    assert any("max_turns" in e for e in errors)
+
+
+def test_validate_config_rejects_non_positive_timeout() -> None:
+    errors = validate_config(AgentSDKConfig(timeout_seconds=0))
+    assert any("timeout_seconds" in e for e in errors)
+
+    errors = validate_config(AgentSDKConfig(timeout_seconds=-1))
+    assert any("timeout_seconds" in e for e in errors)
+
+
+def test_validate_config_rejects_skills_scheme_without_dir() -> None:
+    cfg = AgentSDKConfig(mcp_servers={"s": "skills://my-skill"}, skills_dir=None)
+    errors = validate_config(cfg)
+    assert any("skills_dir" in e for e in errors)
+
+
+def test_validate_config_accepts_valid_defaults() -> None:
+    assert validate_config(AgentSDKConfig()) == []
+
+
+@pytest.mark.parametrize("mode", ["auto", "prompt", "read_only"])
+@pytest.mark.asyncio
+async def test_permission_mode_round_trips(
+    monkeypatch: pytest.MonkeyPatch, mode: str
+) -> None:
+    agent_cls = _install_fake_sdk(monkeypatch)
+    runner = AgentSDKRunner()
+
+    cfg = AgentSDKConfig(permission_mode=mode)  # type: ignore[arg-type]
+    await runner.run("ok", cfg)
+
+    instance = agent_cls.last_instance  # type: ignore[attr-defined]
+    assert instance is not None
+    assert instance.definition.kwargs["permission_mode"] == mode
+
+
+@pytest.mark.asyncio
+async def test_run_with_invalid_config_raises_config_error(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    # SDK present so the config check runs first.
+    _install_fake_sdk(monkeypatch)
+    runner = AgentSDKRunner()
+    with pytest.raises(AgentSDKConfigError):
+        await runner.run("hi", AgentSDKConfig(max_turns=0))
+
+
+def test_runner_name_attribute() -> None:
+    assert AgentSDKRunner.name == "agent-sdk"
diff --git a/tests/test_agent_skills.py b/tests/test_agent_skills.py
new file mode 100644
index 00000000..62fd9dbc
--- /dev/null
+++ b/tests/test_agent_skills.py
@@ -0,0 +1,277 @@
+"""Tests for the Anthropic Agent Skills publisher (engine.agent_skills)."""
+
+from __future__ import annotations
+
+import io
+import tarfile
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from sandcastle.engine.agent_skills import (
+    DEFAULT_BETA_HEADERS,
+    AnthropicSkillsClient,
+    SkillFrontmatter,
+    SkillPackage,
+    SkillValidationError,
+    parse_skill,
+    publish_workflows_as_skills,
+    serialize_skill,
+    workflow_to_skill,
+)
+
+
+# ---------------------------------------------------------------------------
+# Frontmatter validation
+# ---------------------------------------------------------------------------
+
+
+def test_frontmatter_rejects_reserved_anthropic_token() -> None:
+    with pytest.raises(SkillValidationError):
+        SkillFrontmatter(name="anthropic-foo", description="A reserved name.")
+
+
+def test_frontmatter_accepts_simple_name() -> None:
+    fm = SkillFrontmatter(name="researcher", description="A research helper.")
+    assert fm.name == "researcher"
+    assert fm.version == "1.0.0"
+
+
+def test_frontmatter_rejects_long_description() -> None:
+    long_desc = "x" * 1025
+    with pytest.raises(SkillValidationError):
+        SkillFrontmatter(name="researcher", description=long_desc)
+
+
+def test_frontmatter_rejects_reserved_claude_token() -> None:
+    with pytest.raises(SkillValidationError):
+        SkillFrontmatter(name="claude-helper", description="reserved")
+
+
+def test_frontmatter_rejects_uppercase_name() -> None:
+    with pytest.raises(SkillValidationError):
+        SkillFrontmatter(name="Researcher", description="bad casing")
+
+
+# ---------------------------------------------------------------------------
+# workflow_to_skill
+# ---------------------------------------------------------------------------
+
+
+_MINIMAL_WORKFLOW = """
+name: "Lead Enrichment"
+description: "Enrich a company lead with industry data and a sales score."
+input_schema:
+  required: ["company"]
+  properties:
+    company:
+      type: string
+      description: "Company name or URL"
+steps:
+  - id: "extract"
+    prompt: "..."
+  - id: "score"
+    prompt: "..."
+    depends_on: ["extract"]
+""".strip()
+
+
+def test_workflow_to_skill_produces_valid_frontmatter() -> None:
+    package = workflow_to_skill(_MINIMAL_WORKFLOW)
+    assert package.frontmatter.name == "lead-enrichment"
+    assert "Enrich" in package.frontmatter.description
+    assert len(package.frontmatter.description) <= 1024
+    # Body should mention each step id and input.
+    assert "extract" in package.body
+    assert "score" in package.body
+    assert "company" in package.body
+
+
+def test_workflow_to_skill_handles_reserved_tokens_in_name() -> None:
+    yaml_text = """
+name: "Anthropic Helper"
+description: "Should slugify around the reserved token."
+""".strip()
+    package = workflow_to_skill(yaml_text)
+    # Slugify must strip "anthropic" before validation runs.
+    assert "anthropic" not in package.frontmatter.name
+    assert "claude" not in package.frontmatter.name
+    assert package.frontmatter.name  # non-empty
+
+
+# ---------------------------------------------------------------------------
+# serialize / parse round-trip
+# ---------------------------------------------------------------------------
+
+
+def test_serialize_skill_roundtrips_through_parse_skill() -> None:
+    original = SkillPackage(
+        frontmatter=SkillFrontmatter(
+            name="researcher",
+            description="A research helper.",
+            version="2.1.0",
+            model="sonnet",
+            allowed_tools=["bash", "edit"],
+        ),
+        body="# Researcher\n\nHelpful body text.",
+    )
+    blob = serialize_skill(original)
+    parsed = parse_skill(blob)
+    assert parsed.frontmatter.name == "researcher"
+    assert parsed.frontmatter.description == "A research helper."
+    assert parsed.frontmatter.version == "2.1.0"
+    assert parsed.frontmatter.model == "sonnet"
+    assert parsed.frontmatter.allowed_tools == ["bash", "edit"]
+    assert "Helpful body text." in parsed.body
+
+
+def test_serialize_skill_archive_is_valid_targz() -> None:
+    package = workflow_to_skill(_MINIMAL_WORKFLOW)
+    blob = serialize_skill(package)
+    with tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz") as tar:
+        names = tar.getnames()
+    assert "SKILL.md" in names
+
+
+def test_serialize_skill_includes_bundled_files() -> None:
+    package = SkillPackage(
+        frontmatter=SkillFrontmatter(name="bundler", description="Has extras."),
+        body="# Bundler",
+        bundled_files={
+            "reference/notes.txt": b"hello world",
+            "data/sample.json": b'{"a": 1}',
+        },
+    )
+    blob = serialize_skill(package)
+    parsed = parse_skill(blob)
+    assert parsed.bundled_files["reference/notes.txt"] == b"hello world"
+    assert parsed.bundled_files["data/sample.json"] == b'{"a": 1}'
+
+
+# ---------------------------------------------------------------------------
+# parse_skill error handling
+# ---------------------------------------------------------------------------
+
+
+def test_parse_skill_rejects_malformed_archive() -> None:
+    with pytest.raises(SkillValidationError):
+        parse_skill(b"not-a-tar-gz-blob")
+
+
+def test_parse_skill_rejects_archive_without_skill_md() -> None:
+    buf = io.BytesIO()
+    with tarfile.open(fileobj=buf, mode="w:gz") as tar:
+        info = tarfile.TarInfo(name="other.txt")
+        info.size = 3
+        tar.addfile(info, io.BytesIO(b"abc"))
+    with pytest.raises(SkillValidationError):
+        parse_skill(buf.getvalue())
+
+
+# ---------------------------------------------------------------------------
+# AnthropicSkillsClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_client_upload_sends_all_beta_headers_and_multipart() -> None:
+    package = SkillPackage(
+        frontmatter=SkillFrontmatter(name="researcher", description="Helps."),
+        body="# Researcher",
+    )
+
+    mock_response = MagicMock()
+    mock_response.raise_for_status = MagicMock()
+    mock_response.json = MagicMock(return_value={"id": "skill_123", "name": "researcher"})
+
+    mock_client = MagicMock()
+    mock_client.post = AsyncMock(return_value=mock_response)
+    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+    mock_client.__aexit__ = AsyncMock(return_value=None)
+
+    with patch(
+        "sandcastle.engine.agent_skills.httpx.AsyncClient",
+        return_value=mock_client,
+    ):
+        client = AnthropicSkillsClient(api_key="sk-test")
+        result = await client.upload(package)
+
+    assert result == {"id": "skill_123", "name": "researcher"}
+    mock_client.post.assert_awaited_once()
+    call = mock_client.post.await_args
+    # URL
+    assert call.args[0].endswith("/v1/skills")
+    # Headers contain all three beta flags
+    headers = call.kwargs["headers"]
+    beta_value = headers["anthropic-beta"]
+    for flag in DEFAULT_BETA_HEADERS:
+        assert flag in beta_value
+    assert headers["x-api-key"] == "sk-test"
+    # Multipart payload present
+    files = call.kwargs["files"]
+    assert "skill" in files
+    filename, blob, mime = files["skill"]
+    assert filename == "researcher.tar.gz"
+    assert mime == "application/gzip"
+    assert blob.startswith(b"\x1f\x8b")  # gzip magic
+
+
+@pytest.mark.asyncio
+async def test_client_list_skills_unwraps_data_key() -> None:
+    mock_response = MagicMock()
+    mock_response.raise_for_status = MagicMock()
+    mock_response.json = MagicMock(return_value={"data": [{"id": "a"}, {"id": "b"}]})
+
+    mock_client = MagicMock()
+    mock_client.get = AsyncMock(return_value=mock_response)
+    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+    mock_client.__aexit__ = AsyncMock(return_value=None)
+
+    with patch(
+        "sandcastle.engine.agent_skills.httpx.AsyncClient",
+        return_value=mock_client,
+    ):
+        client = AnthropicSkillsClient(api_key="sk-test")
+        skills = await client.list_skills()
+
+    assert skills == [{"id": "a"}, {"id": "b"}]
+
+
+# ---------------------------------------------------------------------------
+# CLI helper
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_publish_workflows_dry_run_does_not_call_upload(tmp_path) -> None:
+    wf = tmp_path / "demo.yaml"
+    wf.write_text(_MINIMAL_WORKFLOW, encoding="utf-8")
+
+    client = MagicMock()
+    client.upload = AsyncMock()
+
+    results = await publish_workflows_as_skills(
+        str(tmp_path), dry_run=True, client=client
+    )
+
+    assert len(results) == 1
+    assert results[0]["status"] == "dry_run"
+    assert results[0]["name"] == "lead-enrichment"
+    client.upload.assert_not_awaited()
+
+
+@pytest.mark.asyncio
+async def test_publish_workflows_uploads_when_not_dry_run(tmp_path) -> None:
+    wf = tmp_path / "demo.yaml"
+    wf.write_text(_MINIMAL_WORKFLOW, encoding="utf-8")
+
+    client = MagicMock()
+    client.upload = AsyncMock(return_value={"id": "skill_ok"})
+
+    results = await publish_workflows_as_skills(
+        str(tmp_path), dry_run=False, client=client
+    )
+
+    assert results[0]["status"] == "uploaded"
+    assert results[0]["response"] == {"id": "skill_ok"}
+    client.upload.assert_awaited_once()
diff --git a/tests/test_agent_webhooks.py b/tests/test_agent_webhooks.py
new file mode 100644
index 00000000..dde5c354
--- /dev/null
+++ b/tests/test_agent_webhooks.py
@@ -0,0 +1,339 @@
+"""Tests for the Anthropic Managed Agents webhook subscriber + handler."""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import hmac
+import json
+import time
+from typing import Any
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import httpx
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from sandcastle.api import agent_webhooks
+from sandcastle.api.agent_webhooks import (
+    AGENT_WEBHOOK_HANDLERS,
+    ANTHROPIC_BETA_HEADER,
+    SUPPORTED_EVENTS,
+    AnthropicWebhookSubscription,
+    register_handler,
+    router,
+    verify_signature,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def app() -> FastAPI:
+    app = FastAPI()
+    app.include_router(router)
+    return app
+
+
+@pytest.fixture
+def client(app: FastAPI) -> TestClient:
+    return TestClient(app)
+
+
+@pytest.fixture(autouse=True)
+def _clear_handlers():
+    """Reset the global handler registry around each test."""
+    snapshot = {k: list(v) for k, v in AGENT_WEBHOOK_HANDLERS.items()}
+    for k in list(AGENT_WEBHOOK_HANDLERS.keys()):
+        AGENT_WEBHOOK_HANDLERS[k] = []
+    yield
+    AGENT_WEBHOOK_HANDLERS.clear()
+    AGENT_WEBHOOK_HANDLERS.update(snapshot)
+
+
+def _sign(secret: str, body: bytes) -> str:
+    return hmac.new(secret.encode(), body, hashlib.sha256).hexdigest()
+
+
+# ---------------------------------------------------------------------------
+# Signature verification
+# ---------------------------------------------------------------------------
+
+
+def test_valid_signature_accepted(client: TestClient, monkeypatch):
+    secret = "topsecret"
+    monkeypatch.setenv("ANTHROPIC_WEBHOOK_SECRET", secret)
+    payload = {"type": "session.status_idle", "session_id": "s1"}
+    body = json.dumps(payload).encode()
+    sig = _sign(secret, body)
+
+    resp = client.post(
+        "/agent-webhooks/anthropic",
+        content=body,
+        headers={
+            "X-Anthropic-Signature": sig,
+            "content-type": "application/json",
+        },
+    )
+    assert resp.status_code == 200
+    assert resp.json()["status"] == "accepted"
+
+
+def test_invalid_signature_rejected(client: TestClient, monkeypatch):
+    monkeypatch.setenv("ANTHROPIC_WEBHOOK_SECRET", "realsecret")
+    payload = {"type": "session.status_idle"}
+    body = json.dumps(payload).encode()
+    bad_sig = _sign("wrongsecret", body)
+
+    resp = client.post(
+        "/agent-webhooks/anthropic",
+        content=body,
+        headers={
+            "X-Anthropic-Signature": bad_sig,
+            "content-type": "application/json",
+        },
+    )
+    assert resp.status_code == 401
+
+
+def test_signature_with_sha256_prefix(client: TestClient, monkeypatch):
+    secret = "abc"
+    monkeypatch.setenv("ANTHROPIC_WEBHOOK_SECRET", secret)
+    body = json.dumps({"type": "session.status_running"}).encode()
+    sig = "sha256=" + _sign(secret, body)
+    resp = client.post(
+        "/agent-webhooks/anthropic",
+        content=body,
+        headers={"X-Anthropic-Signature": sig},
+    )
+    assert resp.status_code == 200
+
+
+# ---------------------------------------------------------------------------
+# Local-mode vs production behaviour when secret is missing
+# ---------------------------------------------------------------------------
+
+
+def test_local_mode_without_secret_accepted_with_warning(
+    client: TestClient, monkeypatch, caplog
+):
+    monkeypatch.delenv("ANTHROPIC_WEBHOOK_SECRET", raising=False)
+    monkeypatch.setattr(agent_webhooks, "settings", SimpleNamespace(is_local_mode=True))
+
+    body = json.dumps({"type": "session.status_idle"}).encode()
+    with caplog.at_level("WARNING"):
+        resp = client.post(
+            "/agent-webhooks/anthropic",
+            content=body,
+            headers={"content-type": "application/json"},
+        )
+    assert resp.status_code == 200
+    assert any("skipping signature verify" in r.message for r in caplog.records)
+
+
+def test_production_mode_without_secret_rejected(client: TestClient, monkeypatch):
+    monkeypatch.delenv("ANTHROPIC_WEBHOOK_SECRET", raising=False)
+    monkeypatch.setattr(agent_webhooks, "settings", SimpleNamespace(is_local_mode=False))
+
+    body = json.dumps({"type": "session.status_idle"}).encode()
+    resp = client.post(
+        "/agent-webhooks/anthropic",
+        content=body,
+        headers={"content-type": "application/json"},
+    )
+    assert resp.status_code == 401
+
+
+# ---------------------------------------------------------------------------
+# Per-event dispatch
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("event_type", list(SUPPORTED_EVENTS))
+def test_event_dispatched_to_registered_handler(
+    client: TestClient, monkeypatch, event_type: str
+):
+    monkeypatch.delenv("ANTHROPIC_WEBHOOK_SECRET", raising=False)
+    monkeypatch.setattr(agent_webhooks, "settings", SimpleNamespace(is_local_mode=True))
+
+    received: list[dict[str, Any]] = []
+
+    async def handler(event: dict[str, Any]) -> None:
+        received.append(event)
+
+    register_handler(event_type, handler)
+
+    payload = {"type": event_type, "id": f"evt-{event_type}"}
+    body = json.dumps(payload).encode()
+
+    resp = client.post(
+        "/agent-webhooks/anthropic",
+        content=body,
+        headers={"content-type": "application/json"},
+    )
+    assert resp.status_code == 200
+
+    # Give the asyncio.create_task a chance to run.
+    for _ in range(20):
+        if received:
+            break
+        time.sleep(0.02)
+
+    assert len(received) == 1
+    assert received[0]["type"] == event_type
+
+
+def test_register_handler_accumulates_multiple_handlers(
+    client: TestClient, monkeypatch
+):
+    monkeypatch.delenv("ANTHROPIC_WEBHOOK_SECRET", raising=False)
+    monkeypatch.setattr(agent_webhooks, "settings", SimpleNamespace(is_local_mode=True))
+
+    calls: list[str] = []
+
+    async def h1(event):
+        calls.append("h1")
+
+    async def h2(event):
+        calls.append("h2")
+
+    register_handler("session.error", h1)
+    register_handler("session.error", h2)
+    assert len(AGENT_WEBHOOK_HANDLERS["session.error"]) == 2
+
+    body = json.dumps({"type": "session.error"}).encode()
+    resp = client.post(
+        "/agent-webhooks/anthropic",
+        content=body,
+        headers={"content-type": "application/json"},
+    )
+    assert resp.status_code == 200
+
+    for _ in range(25):
+        if len(calls) == 2:
+            break
+        time.sleep(0.02)
+    assert sorted(calls) == ["h1", "h2"]
+
+
+# ---------------------------------------------------------------------------
+# Slow handler must not block ACK
+# ---------------------------------------------------------------------------
+
+
+def test_slow_handler_does_not_delay_ack(client: TestClient, monkeypatch):
+    monkeypatch.delenv("ANTHROPIC_WEBHOOK_SECRET", raising=False)
+    monkeypatch.setattr(agent_webhooks, "settings", SimpleNamespace(is_local_mode=True))
+
+    async def slow(event):
+        await asyncio.sleep(0.5)
+
+    register_handler("session.status_running", slow)
+
+    body = json.dumps({"type": "session.status_running"}).encode()
+    start = time.perf_counter()
+    resp = client.post(
+        "/agent-webhooks/anthropic",
+        content=body,
+        headers={"content-type": "application/json"},
+    )
+    elapsed = time.perf_counter() - start
+
+    assert resp.status_code == 200
+    assert elapsed < 0.3, f"ACK took too long: {elapsed:.3f}s"
+
+
+# ---------------------------------------------------------------------------
+# AnthropicWebhookSubscription client
+# ---------------------------------------------------------------------------
+
+
+def _mock_response(status_code: int, body: Any) -> MagicMock:
+    resp = MagicMock(spec=httpx.Response)
+    resp.status_code = status_code
+    resp.json.return_value = body
+    resp.raise_for_status = MagicMock()
+    return resp
+
+
+def test_create_subscription_posts_correct_body_and_beta_header():
+    mock_client = MagicMock(spec=httpx.AsyncClient)
+    mock_client.post = AsyncMock(
+        return_value=_mock_response(
+            200, {"id": "sub_1", "url": "https://x/cb", "events": ["session.status_idle"]}
+        )
+    )
+
+    sub = AnthropicWebhookSubscription(api_key="key", client=mock_client)
+    result = asyncio.run(
+        sub.create_subscription(
+            callback_url="https://x/cb",
+            events=["session.status_idle"],
+            secret="shh",
+        )
+    )
+    assert result["id"] == "sub_1"
+
+    mock_client.post.assert_awaited_once()
+    args, kwargs = mock_client.post.call_args
+    assert args[0] == "/v1/webhooks"
+    assert kwargs["json"] == {
+        "url": "https://x/cb",
+        "events": ["session.status_idle"],
+        "secret": "shh",
+    }
+    headers = kwargs["headers"]
+    assert headers["anthropic-beta"] == ANTHROPIC_BETA_HEADER
+    assert headers["x-api-key"] == "key"
+    assert headers["content-type"] == "application/json"
+
+
+def test_create_subscription_omits_secret_when_not_provided():
+    mock_client = MagicMock(spec=httpx.AsyncClient)
+    mock_client.post = AsyncMock(return_value=_mock_response(200, {"id": "sub_2"}))
+
+    sub = AnthropicWebhookSubscription(api_key="k", client=mock_client)
+    asyncio.run(
+        sub.create_subscription(
+            callback_url="https://x/cb", events=["session.status_running"]
+        )
+    )
+    _, kwargs = mock_client.post.call_args
+    assert "secret" not in kwargs["json"]
+
+
+def test_list_and_delete_subscription():
+    mock_client = MagicMock(spec=httpx.AsyncClient)
+    mock_client.get = AsyncMock(
+        return_value=_mock_response(200, {"data": [{"id": "sub_a"}, {"id": "sub_b"}]})
+    )
+    mock_client.delete = AsyncMock(return_value=_mock_response(204, {}))
+
+    sub = AnthropicWebhookSubscription(api_key="k", client=mock_client)
+    items = asyncio.run(sub.list_subscriptions())
+    assert [s["id"] for s in items] == ["sub_a", "sub_b"]
+
+    asyncio.run(sub.delete_subscription("sub_a"))
+    mock_client.delete.assert_awaited_once()
+    args, _ = mock_client.delete.call_args
+    assert args[0] == "/v1/webhooks/sub_a"
+
+
+# ---------------------------------------------------------------------------
+# verify_signature unit
+# ---------------------------------------------------------------------------
+
+
+def test_verify_signature_unit():
+    body = b'{"hello": "world"}'
+    secret = "s3cr3t"
+    sig = _sign(secret, body)
+    assert verify_signature(secret, body, sig) is True
+    assert verify_signature(secret, body, "sha256=" + sig) is True
+    assert verify_signature(secret, body, "deadbeef") is False
+    assert verify_signature(secret, body, "") is False
diff --git a/tests/test_computer_use.py b/tests/test_computer_use.py
new file mode 100644
index 00000000..5e9e23c3
--- /dev/null
+++ b/tests/test_computer_use.py
@@ -0,0 +1,132 @@
+"""Tests for the Computer Use integration helper."""
+
+from __future__ import annotations
+
+import pytest
+
+from sandcastle.engine.computer_use import (
+    BETA_HEADER_2025_01_24,
+    BETA_HEADER_2025_11_24,
+    SAFETY_CHECKLIST,
+    ComputerUseConfig,
+    build_beta_header,
+    build_tool_definitions,
+    should_pause_for_approval,
+    validate_config,
+)
+
+
+# ---------------------------------------------------------------------------
+# validate_config
+# ---------------------------------------------------------------------------
+
+
+def test_validate_config_rejects_display_width_too_large() -> None:
+    config = ComputerUseConfig(display_width_px=2600)
+    issues = validate_config(config)
+    assert any("exceeds maximum" in issue for issue in issues)
+
+
+def test_validate_config_rejects_display_width_too_small() -> None:
+    config = ComputerUseConfig(display_width_px=320)
+    issues = validate_config(config)
+    assert any("below minimum" in issue for issue in issues)
+
+
+def test_validate_config_rejects_empty_tools() -> None:
+    config = ComputerUseConfig(tools=[])
+    issues = validate_config(config)
+    assert any("tools list is empty" in issue for issue in issues)
+
+
+def test_validate_config_warns_on_empty_approval_list() -> None:
+    config = ComputerUseConfig(require_human_approval_for=[])
+    issues = validate_config(config)
+    warn = [issue for issue in issues if issue.startswith("WARN:")]
+    assert warn, "expected a WARN: entry for empty approval list"
+    assert "require_human_approval_for" in warn[0]
+
+
+def test_validate_config_clean_default() -> None:
+    config = ComputerUseConfig()
+    assert validate_config(config) == []
+
+
+# ---------------------------------------------------------------------------
+# build_tool_definitions
+# ---------------------------------------------------------------------------
+
+
+def test_build_tool_definitions_bash_shape() -> None:
+    config = ComputerUseConfig(tools=["bash"])
+    defs = build_tool_definitions(config)
+    assert defs == [{"type": "bash_20250124", "name": "bash"}]
+
+
+def test_build_tool_definitions_text_editor_shape() -> None:
+    config = ComputerUseConfig(tools=["text_editor"])
+    defs = build_tool_definitions(config)
+    assert defs == [
+        {"type": "text_editor_20250124", "name": "str_replace_editor"}
+    ]
+
+
+def test_build_tool_definitions_computer_shape() -> None:
+    config = ComputerUseConfig(
+        tools=["computer"], display_width_px=1440, display_height_px=900
+    )
+    defs = build_tool_definitions(config)
+    assert defs == [
+        {
+            "type": "computer_20251124",
+            "name": "computer",
+            "display_width_px": 1440,
+            "display_height_px": 900,
+        }
+    ]
+
+
+# ---------------------------------------------------------------------------
+# build_beta_header
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["claude-opus-4-7", "claude-sonnet-4-6", "claude-haiku-4-5"],
+)
+def test_build_beta_header_new_models(model: str) -> None:
+    assert build_beta_header(model) == BETA_HEADER_2025_11_24
+
+
+def test_build_beta_header_older_sonnet() -> None:
+    assert build_beta_header("claude-sonnet-4-5") == BETA_HEADER_2025_01_24
+
+
+# ---------------------------------------------------------------------------
+# should_pause_for_approval
+# ---------------------------------------------------------------------------
+
+
+def test_should_pause_for_approval_matches_action_name() -> None:
+    config = ComputerUseConfig()  # default includes "mouse_click"
+    event = {"name": "computer", "input": {"action": "mouse_click", "x": 10, "y": 20}}
+    assert should_pause_for_approval(event, config) is True
+
+
+def test_should_pause_for_approval_skips_unlisted_action() -> None:
+    config = ComputerUseConfig(require_human_approval_for=["submit_form"])
+    event = {"name": "computer", "input": {"action": "screenshot"}}
+    assert should_pause_for_approval(event, config) is False
+
+
+# ---------------------------------------------------------------------------
+# SAFETY_CHECKLIST
+# ---------------------------------------------------------------------------
+
+
+def test_safety_checklist_has_enough_items() -> None:
+    assert len(SAFETY_CHECKLIST) >= 6
+    for item in SAFETY_CHECKLIST:
+        assert isinstance(item, str)
+        assert item.strip(), "safety checklist items must be non-empty"
diff --git a/tests/test_executor_deep.py b/tests/test_executor_deep.py
index 83c4800d..a87eda54 100644
--- a/tests/test_executor_deep.py
+++ b/tests/test_executor_deep.py
@@ -1085,13 +1085,13 @@ def test_apply_variant_no_override_keeps_original(self):
         assert result.autopilot is None
 
     def test_dataclasses_replace_count(self):
-        """Sanity check: StepDefinition has 38 fields."""
+        """Sanity check: StepDefinition has 49 fields (v0.32 prep added 2)."""
         import dataclasses as dc
         fields = dc.fields(StepDefinition)
         # If someone adds a field, this test reminds them to check
         # all places that construct StepDefinitions
-        assert len(fields) == 38, (
-            f"StepDefinition has {len(fields)} fields (expected 38). "
+        assert len(fields) == 49, (
+            f"StepDefinition has {len(fields)} fields (expected 49). "
             "If you added a new field, verify all dataclasses.replace() "
             "callers handle it correctly."
         )
diff --git a/tests/test_managed_agent_v30.py b/tests/test_managed_agent_v30.py
index ac9c5e8f..87e943f0 100644
--- a/tests/test_managed_agent_v30.py
+++ b/tests/test_managed_agent_v30.py
@@ -238,8 +238,8 @@ def test_in_non_llm_types(self):
         assert "managed-agent" in NON_LLM_TYPES
 
     def test_step_type_count(self):
-        """VALID_STEP_TYPES should have 22 entries after adding agent."""
-        assert len(VALID_STEP_TYPES) == 22
+        """VALID_STEP_TYPES should have 24 entries (22 + trajectory-replay + computer-use)."""
+        assert len(VALID_STEP_TYPES) == 24
 
 
 # ===================================================================
diff --git a/tests/test_managed_agent_wires.py b/tests/test_managed_agent_wires.py
new file mode 100644
index 00000000..3fad9b70
--- /dev/null
+++ b/tests/test_managed_agent_wires.py
@@ -0,0 +1,529 @@
+"""Tier-1 wire fixes for the managed-agent step.
+
+Covers:
+- tools_enabled forwarding to the agent-create payload
+- temperature / max_tokens / thinking_budget plumbing
+- stream: False collecting events server-side before assembly
+- per-model pricing table with fallback + single warning
+- fallback_template chain (str or list, walked left-to-right, capped)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+import httpx
+
+from sandcastle.engine import executor as _executor_mod
+from sandcastle.engine.dag import ManagedAgentConfig, StepDefinition
+from sandcastle.engine.executor import RunContext
+
+
+_execute_managed_agent_step = _executor_mod._execute_managed_agent_step
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_context() -> RunContext:
+    return RunContext(
+        run_id="run-wires-1",
+        input={"topic": "demo"},
+        step_outputs={},
+        step_results={},
+    )
+
+
+def _clear_caches():
+    _executor_mod._managed_agent_cache.clear()
+    _executor_mod._managed_env_cache.clear()
+    _executor_mod._warned_unknown_agent_models.clear()
+
+
+def _mock_sse_stream(events: list[dict]):
+    lines = [f"data: {json.dumps(e)}" for e in events]
+    lines.append("")
+
+    class FakeStream:
+        async def aiter_lines(self):
+            for line in lines:
+                yield line
+
+    stream_ctx = AsyncMock()
+    stream_ctx.__aenter__ = AsyncMock(return_value=FakeStream())
+    stream_ctx.__aexit__ = AsyncMock(return_value=False)
+    return stream_ctx
+
+
+def _build_mock_client(
+    captured: dict | None = None,
+    sse_events: list[dict] | None = None,
+):
+    """Construct a single AsyncClient mock that records POST payloads."""
+    client = AsyncMock()
+    if captured is None:
+        captured = {}
+    captured.setdefault("agents", [])
+    captured.setdefault("environments", [])
+    captured.setdefault("sessions", [])
+    captured.setdefault("events", [])
+
+    async def mock_post(url, **kwargs):
+        body = kwargs.get("json", {})
+        resp = MagicMock()
+        resp.status_code = 200
+        if "/agents" in url and "/sessions" not in url:
+            captured["agents"].append(body)
+            resp.json.return_value = {"id": "ag_test"}
+        elif "/environments" in url:
+            captured["environments"].append(body)
+            resp.json.return_value = {"id": "env_test"}
+        elif "/sessions" in url and "/events" in url:
+            captured["events"].append(body)
+            resp.json.return_value = {}
+        elif "/sessions" in url:
+            captured["sessions"].append(body)
+            resp.json.return_value = {"id": "sess_test"}
+        else:
+            resp.json.return_value = {"id": "x"}
+        return resp
+
+    async def mock_delete(url, **kwargs):
+        resp = MagicMock()
+        resp.status_code = 200
+        return resp
+
+    events = sse_events if sse_events is not None else [
+        {"type": "agent.message", "content": [{"type": "text", "text": "ok"}]},
+        {"type": "session.status_idle"},
+    ]
+    client.post = AsyncMock(side_effect=mock_post)
+    client.delete = AsyncMock(side_effect=mock_delete)
+    client.stream = MagicMock(return_value=_mock_sse_stream(events))
+    client.__aenter__ = AsyncMock(return_value=client)
+    client.__aexit__ = AsyncMock(return_value=False)
+    return client, captured
+
+
+def _cleanup_client():
+    c = AsyncMock()
+    c.delete = AsyncMock(return_value=MagicMock(status_code=200))
+    c.__aenter__ = AsyncMock(return_value=c)
+    c.__aexit__ = AsyncMock(return_value=False)
+    return c
+
+
+# ---------------------------------------------------------------------------
+# 1. tools_enabled wiring
+# ---------------------------------------------------------------------------
+
+class TestToolsEnabledWiring:
+
+    def setup_method(self):
+        _clear_caches()
+
+    @pytest.mark.asyncio
+    async def test_tools_enabled_list_is_forwarded(self):
+        """When tools_enabled is set, request 'tools' field maps names -> {type: name}."""
+        step = StepDefinition(
+            id="ma-tools",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto",
+                tools_enabled=["bash", "web_search"],
+                message="hi",
+            ),
+        )
+        client, captured = _build_mock_client()
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=[client, _cleanup_client()]):
+                result = await _execute_managed_agent_step(step, _make_context())
+        assert result.status == "completed"
+        assert captured["agents"], "agent-create call should have happened"
+        tools = captured["agents"][0]["tools"]
+        assert tools == [{"type": "bash"}, {"type": "web_search"}]
+
+    @pytest.mark.asyncio
+    async def test_tools_enabled_none_uses_default_toolset(self):
+        """When tools_enabled is None, default managed toolset is used."""
+        step = StepDefinition(
+            id="ma-default",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto", tools_enabled=None, message="hi"
+            ),
+        )
+        client, captured = _build_mock_client()
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=[client, _cleanup_client()]):
+                await _execute_managed_agent_step(step, _make_context())
+        assert captured["agents"][0]["tools"] == [{"type": "agent_toolset_20260401"}]
+
+    @pytest.mark.asyncio
+    async def test_tools_enabled_empty_list_uses_default(self):
+        """An empty list is treated like None - default toolset stays."""
+        step = StepDefinition(
+            id="ma-empty",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto", tools_enabled=[], message="hi"
+            ),
+        )
+        client, captured = _build_mock_client()
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=[client, _cleanup_client()]):
+                await _execute_managed_agent_step(step, _make_context())
+        assert captured["agents"][0]["tools"] == [{"type": "agent_toolset_20260401"}]
+
+
+# ---------------------------------------------------------------------------
+# 2. Sampling params plumbed through agent-create
+# ---------------------------------------------------------------------------
+
+class TestSamplingParams:
+
+    def setup_method(self):
+        _clear_caches()
+
+    @pytest.mark.asyncio
+    async def test_all_sampling_fields_forwarded(self):
+        step = StepDefinition(
+            id="ma-samp",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto",
+                message="hi",
+                temperature=0.4,
+                max_tokens=2048,
+                thinking_budget=8000,
+            ),
+        )
+        client, captured = _build_mock_client()
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=[client, _cleanup_client()]):
+                await _execute_managed_agent_step(step, _make_context())
+        body = captured["agents"][0]
+        assert body["temperature"] == 0.4
+        assert body["max_tokens"] == 2048
+        assert body["thinking"] == {"type": "enabled", "budget_tokens": 8000}
+
+    @pytest.mark.asyncio
+    async def test_none_sampling_fields_omitted(self):
+        """When fields are None, agent payload must not include them."""
+        step = StepDefinition(
+            id="ma-samp-none",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(agent_id="auto", message="hi"),
+        )
+        client, captured = _build_mock_client()
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=[client, _cleanup_client()]):
+                await _execute_managed_agent_step(step, _make_context())
+        body = captured["agents"][0]
+        assert "temperature" not in body
+        assert "max_tokens" not in body
+        assert "thinking" not in body
+
+    @pytest.mark.asyncio
+    async def test_partial_sampling_fields(self):
+        """Only the set fields appear in payload; the rest are omitted."""
+        step = StepDefinition(
+            id="ma-samp-part",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto", message="hi", temperature=0.0
+            ),
+        )
+        client, captured = _build_mock_client()
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=[client, _cleanup_client()]):
+                await _execute_managed_agent_step(step, _make_context())
+        body = captured["agents"][0]
+        assert body["temperature"] == 0.0
+        assert "max_tokens" not in body
+        assert "thinking" not in body
+
+
+# ---------------------------------------------------------------------------
+# 3. stream: False buffers events server-side
+# ---------------------------------------------------------------------------
+
+class TestStreamCollection:
+
+    def setup_method(self):
+        _clear_caches()
+
+    @pytest.mark.asyncio
+    async def test_stream_false_returns_final_text_only(self):
+        """With stream=False, all events buffer first; final text is the concatenation."""
+        step = StepDefinition(
+            id="ma-buf",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto", message="hi", stream=False
+            ),
+        )
+        sse = [
+            {"type": "agent.message",
+             "content": [{"type": "text", "text": "part-A "}],
+             "usage": {"input_tokens": 10, "output_tokens": 5}},
+            {"type": "agent.message",
+             "content": [{"type": "text", "text": "part-B"}],
+             "usage": {"input_tokens": 0, "output_tokens": 3}},
+            {"type": "session.status_idle"},
+        ]
+        client, _ = _build_mock_client(sse_events=sse)
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=[client, _cleanup_client()]):
+                result = await _execute_managed_agent_step(step, _make_context())
+        assert result.status == "completed"
+        assert result.output == "part-A part-B"
+        # Cost computed from tokens regardless of mode
+        assert result.cost_usd > 0
+
+    @pytest.mark.asyncio
+    async def test_stream_true_default_still_works(self):
+        """Default stream=True still collects text incrementally."""
+        step = StepDefinition(
+            id="ma-stream",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(agent_id="auto", message="hi"),
+        )
+        sse = [
+            {"type": "agent.message", "content": [{"type": "text", "text": "x"}]},
+            {"type": "agent.message", "content": [{"type": "text", "text": "y"}]},
+            {"type": "session.status_idle"},
+        ]
+        client, _ = _build_mock_client(sse_events=sse)
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=[client, _cleanup_client()]):
+                result = await _execute_managed_agent_step(step, _make_context())
+        assert result.output == "xy"
+
+    def test_stream_default_is_true(self):
+        """Default value preserves backward compat."""
+        assert ManagedAgentConfig().stream is True
+
+
+# ---------------------------------------------------------------------------
+# 4. Pricing table
+# ---------------------------------------------------------------------------
+
+class TestPricingTable:
+
+    def setup_method(self):
+        _clear_caches()
+
+    def test_known_model_returns_table_value(self):
+        assert _executor_mod._agent_model_pricing("claude-opus-4-7") == (5.0, 25.0)
+        assert _executor_mod._agent_model_pricing("claude-haiku-4-5") == (1.0, 5.0)
+        assert _executor_mod._agent_model_pricing("claude-sonnet-4-6") == (3.0, 15.0)
+
+    def test_unknown_model_falls_back_to_sonnet(self):
+        price = _executor_mod._agent_model_pricing("claude-future-99")
+        assert price == _executor_mod._AGENT_PRICING_FALLBACK
+        assert price == (3.0, 15.0)
+
+    def test_unknown_model_warns_once_per_process(self, caplog):
+        with caplog.at_level(logging.WARNING, logger="sandcastle.engine.executor"):
+            _executor_mod._agent_model_pricing("model-zzz")
+            _executor_mod._agent_model_pricing("model-zzz")
+            _executor_mod._agent_model_pricing("model-zzz")
+        zzz_warnings = [r for r in caplog.records if "model-zzz" in r.getMessage()]
+        assert len(zzz_warnings) == 1
+
+    @pytest.mark.asyncio
+    async def test_cost_uses_model_pricing(self):
+        """Cost computed for opus-4-7 uses the table (5/25), not Sonnet defaults."""
+        step = StepDefinition(
+            id="ma-price",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto", message="hi", model="claude-opus-4-7"
+            ),
+        )
+        sse = [
+            {"type": "agent.message",
+             "content": [{"type": "text", "text": "ok"}],
+             "usage": {"input_tokens": 1_000_000, "output_tokens": 1_000_000}},
+            {"type": "session.status_idle"},
+        ]
+        client, _ = _build_mock_client(sse_events=sse)
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=[client, _cleanup_client()]):
+                result = await _execute_managed_agent_step(step, _make_context())
+        # 1M input * $5 + 1M output * $25 = $30
+        assert result.cost_usd == pytest.approx(30.0, rel=1e-3)
+
+
+# ---------------------------------------------------------------------------
+# 5. Fallback chain semantics
+# ---------------------------------------------------------------------------
+
+class TestFallbackChain:
+
+    def setup_method(self):
+        _clear_caches()
+
+    @pytest.mark.asyncio
+    async def test_string_form_still_accepted(self):
+        """Single template string still triggers a one-step fallback chain."""
+        step = StepDefinition(
+            id="ma-fb-str",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto",
+                message="hi",
+                agent_template="researcher",
+                fallback_template="coder",
+            ),
+        )
+        # Primary call times out (httpx exception path triggers fallback chain).
+        primary_client = AsyncMock()
+        primary_client.post = AsyncMock(side_effect=httpx.TimeoutException("timed out"))
+        primary_client.delete = AsyncMock()
+        primary_client.__aenter__ = AsyncMock(return_value=primary_client)
+        primary_client.__aexit__ = AsyncMock(return_value=False)
+
+        fb_client, _ = _build_mock_client()
+        clients = [primary_client, fb_client, _cleanup_client()]
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=clients):
+                result = await _execute_managed_agent_step(step, _make_context())
+        assert result.status == "completed"
+
+    @pytest.mark.asyncio
+    async def test_list_of_two_walked_in_order(self):
+        """First fallback fails, second succeeds; we walk in declared order."""
+        step = StepDefinition(
+            id="ma-fb-chain",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto",
+                message="hi",
+                agent_template="researcher",
+                fallback_template=["coder", "writer"],
+            ),
+        )
+        # Primary fails, first fallback fails, second fallback succeeds.
+        def _failing_client():
+            c = AsyncMock()
+            c.post = AsyncMock(side_effect=httpx.TimeoutException("nope"))
+            c.delete = AsyncMock()
+            c.__aenter__ = AsyncMock(return_value=c)
+            c.__aexit__ = AsyncMock(return_value=False)
+            return c
+
+        fb2_client, _ = _build_mock_client()
+        clients = [
+            _failing_client(),       # primary times out
+            _failing_client(),       # fb1 = coder times out
+            fb2_client,              # fb2 = writer succeeds
+            _cleanup_client(),       # cleanup for fb2 session
+        ]
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=clients):
+                result = await _execute_managed_agent_step(step, _make_context())
+        assert result.status == "completed"
+
+    @pytest.mark.asyncio
+    async def test_primary_success_skips_chain(self):
+        """When the primary completes, fallback list is never invoked."""
+        step = StepDefinition(
+            id="ma-fb-skip",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto",
+                message="hi",
+                agent_template="researcher",
+                fallback_template=["coder", "writer"],
+            ),
+        )
+        primary_client, _ = _build_mock_client()
+        # Only one client + cleanup should be consumed. If the chain ran, we'd
+        # exhaust the side_effect list and StopIteration would surface.
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch(
+                "httpx.AsyncClient",
+                side_effect=[primary_client, _cleanup_client()],
+            ):
+                result = await _execute_managed_agent_step(step, _make_context())
+        assert result.status == "completed"
+
+    @pytest.mark.asyncio
+    async def test_all_fail_returns_last_error(self):
+        """When every link in the chain fails, the last error is surfaced."""
+        step = StepDefinition(
+            id="ma-fb-allfail",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto",
+                message="hi",
+                agent_template="researcher",
+                fallback_template=["coder", "writer"],
+            ),
+        )
+
+        def _failing_client():
+            c = AsyncMock()
+            c.post = AsyncMock(side_effect=httpx.TimeoutException("boom"))
+            c.delete = AsyncMock()
+            c.__aenter__ = AsyncMock(return_value=c)
+            c.__aexit__ = AsyncMock(return_value=False)
+            return c
+
+        clients = [
+            _failing_client(),       # primary
+            _failing_client(),       # fb1 = coder
+            _failing_client(),       # fb2 = writer
+        ]
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=clients):
+                result = await _execute_managed_agent_step(step, _make_context())
+        assert result.status == "failed"
+        assert "Primary failed" in result.error
+        # Last fallback name in the chain surfaces
+        assert "writer" in result.error
+
+    @pytest.mark.asyncio
+    async def test_chain_capped_at_five(self):
+        """Chains longer than five entries are truncated."""
+        step = StepDefinition(
+            id="ma-fb-cap",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto",
+                message="hi",
+                agent_template="researcher",
+                # 7 entries: only the first 5 should be attempted.
+                fallback_template=[
+                    "coder", "writer", "analyst", "reviewer",
+                    "scraper", "tester", "devops",
+                ],
+            ),
+        )
+
+        def _failing_client():
+            c = AsyncMock()
+            c.post = AsyncMock(side_effect=httpx.TimeoutException("boom"))
+            c.delete = AsyncMock()
+            c.__aenter__ = AsyncMock(return_value=c)
+            c.__aexit__ = AsyncMock(return_value=False)
+            return c
+
+        # 1 primary + 5 fallbacks = 6 clients. If the cap is ignored we'd
+        # need 8 and StopIteration would be raised.
+        clients: list[AsyncMock] = [_failing_client() for _ in range(6)]
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch("httpx.AsyncClient", side_effect=clients):
+                result = await _execute_managed_agent_step(step, _make_context())
+        assert result.status == "failed"
+        # Truncation means "scraper" is the last attempted name (5th fallback),
+        # not the trailing "devops".
+        assert "scraper" in result.error
diff --git a/tests/test_memory_stores.py b/tests/test_memory_stores.py
new file mode 100644
index 00000000..8e239b5d
--- /dev/null
+++ b/tests/test_memory_stores.py
@@ -0,0 +1,246 @@
+"""Tests for the Anthropic Memory Stores client.
+
+Uses unittest.mock to patch httpx.AsyncClient so no network is involved.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import httpx
+import pytest
+
+from sandcastle.engine.memory_stores import (
+    DEFAULT_BETA_HEADER,
+    MAX_MEMORY_FILE_BYTES,
+    MemoryFileTooLargeError,
+    MemoryStoresClient,
+    MemoryStoresConflict,
+    MemoryStoresError,
+    MemoryStoresLimitError,
+    MemoryStoresNotFound,
+)
+
+
+def _make_response(
+    status: int = 200,
+    json_body: Any = None,
+    text: str = "",
+) -> MagicMock:
+    """Build a mock httpx.Response with the fields the client touches."""
+    resp = MagicMock(spec=httpx.Response)
+    resp.status_code = status
+    resp.text = text or ""
+    if json_body is None:
+        json_body = {}
+    resp.json = MagicMock(return_value=json_body)
+    return resp
+
+
+class _CapturingClient:
+    """Async context manager double for httpx.AsyncClient.
+
+    Records the latest .request(...) kwargs on the surrounding test via the
+    `calls` list it is initialised with.
+    """
+
+    def __init__(self, calls: list[dict[str, Any]], response: MagicMock) -> None:
+        self._calls = calls
+        self._response = response
+
+    async def __aenter__(self) -> "_CapturingClient":
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb) -> None:
+        return None
+
+    async def request(self, method: str, url: str, **kwargs: Any) -> MagicMock:
+        self._calls.append({"method": method, "url": url, **kwargs})
+        return self._response
+
+
+def _patch_httpx(response: MagicMock) -> tuple[Any, list[dict[str, Any]]]:
+    """Return a patch context manager and the list that will collect calls."""
+    calls: list[dict[str, Any]] = []
+
+    def _factory(*args: Any, **kwargs: Any) -> _CapturingClient:
+        return _CapturingClient(calls, response)
+
+    return patch("sandcastle.engine.memory_stores.httpx.AsyncClient", _factory), calls
+
+
+@pytest.fixture
+def client() -> MemoryStoresClient:
+    return MemoryStoresClient(api_key="sk-test-123")
+
+
+# ---------------------------------------------------------------------------
+# create_store
+# ---------------------------------------------------------------------------
+async def test_create_store_posts_correct_body(client: MemoryStoresClient) -> None:
+    resp = _make_response(200, {"id": "ms_abc", "name": "scratch"})
+    ctx, calls = _patch_httpx(resp)
+    with ctx:
+        result = await client.create_store("scratch", description="for tests")
+    assert result == {"id": "ms_abc", "name": "scratch"}
+    assert len(calls) == 1
+    call = calls[0]
+    assert call["method"] == "POST"
+    assert call["url"].endswith("/v1/memory_stores")
+    assert call["json"] == {
+        "name": "scratch",
+        "read_only": False,
+        "description": "for tests",
+    }
+
+
+async def test_create_store_sets_beta_and_auth_headers(
+    client: MemoryStoresClient,
+) -> None:
+    resp = _make_response(200, {"id": "ms_abc"})
+    ctx, calls = _patch_httpx(resp)
+    with ctx:
+        await client.create_store("scratch", read_only=True)
+    headers = calls[0]["headers"]
+    assert headers["x-api-key"] == "sk-test-123"
+    assert headers["anthropic-beta"] == DEFAULT_BETA_HEADER
+    assert calls[0]["json"]["read_only"] is True
+    assert "description" not in calls[0]["json"]
+
+
+# ---------------------------------------------------------------------------
+# list / get / delete
+# ---------------------------------------------------------------------------
+async def test_list_stores_unwraps_data_envelope(client: MemoryStoresClient) -> None:
+    resp = _make_response(200, {"data": [{"id": "ms_1"}, {"id": "ms_2"}]})
+    ctx, calls = _patch_httpx(resp)
+    with ctx:
+        stores = await client.list_stores(limit=25)
+    assert stores == [{"id": "ms_1"}, {"id": "ms_2"}]
+    assert calls[0]["method"] == "GET"
+    assert calls[0]["params"] == {"limit": 25}
+
+
+async def test_get_store_returns_payload(client: MemoryStoresClient) -> None:
+    resp = _make_response(200, {"id": "ms_xyz", "name": "demo"})
+    ctx, calls = _patch_httpx(resp)
+    with ctx:
+        store = await client.get_store("ms_xyz")
+    assert store["id"] == "ms_xyz"
+    assert calls[0]["url"].endswith("/v1/memory_stores/ms_xyz")
+
+
+async def test_delete_store_uses_delete_verb(client: MemoryStoresClient) -> None:
+    resp = _make_response(204)
+    ctx, calls = _patch_httpx(resp)
+    with ctx:
+        result = await client.delete_store("ms_xyz")
+    assert result is None
+    assert calls[0]["method"] == "DELETE"
+    assert calls[0]["url"].endswith("/v1/memory_stores/ms_xyz")
+
+
+# ---------------------------------------------------------------------------
+# write_memory
+# ---------------------------------------------------------------------------
+async def test_write_memory_includes_if_match_when_version_given(
+    client: MemoryStoresClient,
+) -> None:
+    resp = _make_response(200, {"path": "notes.md", "version": "v2"})
+    ctx, calls = _patch_httpx(resp)
+    with ctx:
+        await client.write_memory(
+            "ms_1", "notes.md", "hello", expected_version="v1-sha256"
+        )
+    headers = calls[0]["headers"]
+    assert headers.get("If-Match") == "v1-sha256"
+    assert calls[0]["method"] == "PUT"
+    assert calls[0]["json"] == {"content": "hello"}
+
+
+async def test_write_memory_omits_if_match_when_no_version(
+    client: MemoryStoresClient,
+) -> None:
+    resp = _make_response(200, {"path": "notes.md", "version": "v1"})
+    ctx, calls = _patch_httpx(resp)
+    with ctx:
+        await client.write_memory("ms_1", "notes.md", "hello")
+    assert "If-Match" not in calls[0]["headers"]
+
+
+async def test_write_memory_rejects_oversize_content(
+    client: MemoryStoresClient,
+) -> None:
+    payload = "x" * (MAX_MEMORY_FILE_BYTES + 1)
+    with pytest.raises(MemoryFileTooLargeError):
+        await client.write_memory("ms_1", "big.txt", payload)
+
+
+# ---------------------------------------------------------------------------
+# read_memory + redact_version
+# ---------------------------------------------------------------------------
+async def test_read_memory_passes_version_param(client: MemoryStoresClient) -> None:
+    resp = _make_response(200, {"path": "notes.md", "content": "hi", "version": "v7"})
+    ctx, calls = _patch_httpx(resp)
+    with ctx:
+        result = await client.read_memory("ms_1", "notes.md", version="v7")
+    assert result["version"] == "v7"
+    assert calls[0]["params"] == {"version": "v7"}
+
+
+async def test_redact_version_calls_redact_path(client: MemoryStoresClient) -> None:
+    resp = _make_response(204)
+    ctx, calls = _patch_httpx(resp)
+    with ctx:
+        await client.redact_version("ms_1", "ver_42", reason="gdpr request 991")
+    assert calls[0]["method"] == "POST"
+    assert calls[0]["url"].endswith("/v1/memory_stores/ms_1/versions/ver_42/redact")
+    assert calls[0]["json"] == {"reason": "gdpr request 991"}
+
+
+# ---------------------------------------------------------------------------
+# attach_to_session_payload
+# ---------------------------------------------------------------------------
+def test_attach_to_session_payload_rejects_more_than_eight() -> None:
+    too_many = [f"ms_{i}" for i in range(9)]
+    with pytest.raises(MemoryStoresLimitError):
+        MemoryStoresClient.attach_to_session_payload(too_many)
+
+
+def test_attach_to_session_payload_shape() -> None:
+    payload = MemoryStoresClient.attach_to_session_payload(["ms_a", "ms_b"])
+    assert payload == [
+        {"type": "memory_store", "id": "ms_a"},
+        {"type": "memory_store", "id": "ms_b"},
+    ]
+
+
+# ---------------------------------------------------------------------------
+# Error mapping (404 / 409 / 5xx)
+# ---------------------------------------------------------------------------
+async def test_404_maps_to_not_found(client: MemoryStoresClient) -> None:
+    resp = _make_response(
+        404, {"error": {"message": "store not found"}}, text="not found"
+    )
+    ctx, _ = _patch_httpx(resp)
+    with ctx, pytest.raises(MemoryStoresNotFound):
+        await client.get_store("ms_missing")
+
+
+async def test_409_maps_to_conflict(client: MemoryStoresClient) -> None:
+    resp = _make_response(
+        409, {"error": {"message": "version mismatch"}}, text="conflict"
+    )
+    ctx, _ = _patch_httpx(resp)
+    with ctx, pytest.raises(MemoryStoresConflict):
+        await client.write_memory("ms_1", "notes.md", "hi", expected_version="stale")
+
+
+async def test_500_maps_to_base_error(client: MemoryStoresClient) -> None:
+    resp = _make_response(500, {"error": {"message": "boom"}}, text="boom")
+    ctx, _ = _patch_httpx(resp)
+    with ctx, pytest.raises(MemoryStoresError) as info:
+        await client.list_stores()
+    # Make sure we did NOT misclassify the 500 as a more specific subclass.
+    assert not isinstance(info.value, (MemoryStoresNotFound, MemoryStoresConflict))
diff --git a/tests/test_multiagent.py b/tests/test_multiagent.py
new file mode 100644
index 00000000..532a6ea6
--- /dev/null
+++ b/tests/test_multiagent.py
@@ -0,0 +1,230 @@
+"""Tests for the multiagent coordinator helper (v0.32 prep)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from sandcastle.engine.dag import parse_yaml_string, validate
+from sandcastle.engine.multiagent import (
+    MAX_CONCURRENT_THREADS,
+    MAX_ROSTER_SIZE,
+    MultiagentConfig,
+    MultiagentRosterEntry,
+    RosterValidationError,
+    ThreadEvent,
+    build_coordinator_payload,
+    parse_thread_event,
+    validate_roster,
+)
+
+
+# ---------------------------------------------------------------------------
+# MultiagentConfig roster-size validation
+# ---------------------------------------------------------------------------
+
+class TestRosterSize:
+    def test_empty_roster_is_allowed(self):
+        cfg = MultiagentConfig(roster=[])
+        assert cfg.roster == []
+        assert cfg.max_concurrent_threads == 25
+
+    def test_max_size_roster_is_allowed(self):
+        roster = [
+            MultiagentRosterEntry(type="agent", id=f"agent-{i}")
+            for i in range(MAX_ROSTER_SIZE)
+        ]
+        cfg = MultiagentConfig(roster=roster)
+        assert len(cfg.roster) == MAX_ROSTER_SIZE
+
+    def test_oversized_roster_raises(self):
+        roster = [
+            MultiagentRosterEntry(type="agent", id=f"agent-{i}")
+            for i in range(MAX_ROSTER_SIZE + 1)
+        ]
+        with pytest.raises(RosterValidationError) as exc:
+            MultiagentConfig(roster=roster)
+        assert any("exceeds max" in e for e in exc.value.errors)
+
+    def test_max_concurrent_threads_capped(self):
+        with pytest.raises(RosterValidationError):
+            MultiagentConfig(
+                roster=[],
+                max_concurrent_threads=MAX_CONCURRENT_THREADS + 1,
+            )
+
+
+# ---------------------------------------------------------------------------
+# validate_roster - structural checks
+# ---------------------------------------------------------------------------
+
+class TestValidateRoster:
+    def test_missing_id_on_agent_entry(self):
+        errors = validate_roster([
+            MultiagentRosterEntry(type="agent", id=None, nickname="r"),
+        ])
+        assert errors
+        assert any("non-empty 'id'" in e for e in errors)
+
+    def test_two_self_entries_rejected(self):
+        errors = validate_roster([
+            MultiagentRosterEntry(type="self", nickname="me1"),
+            MultiagentRosterEntry(type="self", nickname="me2"),
+        ])
+        assert any("'self' entries" in e for e in errors)
+
+    def test_valid_mixed_roster(self):
+        errors = validate_roster([
+            MultiagentRosterEntry(type="self", nickname="root"),
+            MultiagentRosterEntry(type="agent", id="ag_1", nickname="a"),
+            MultiagentRosterEntry(type="agent", id="ag_2", nickname="b"),
+        ])
+        assert errors == []
+
+
+# ---------------------------------------------------------------------------
+# build_coordinator_payload - wire format
+# ---------------------------------------------------------------------------
+
+class TestBuildPayload:
+    def test_basic_shape(self):
+        cfg = MultiagentConfig(
+            roster=[
+                MultiagentRosterEntry(
+                    type="agent", id="ag_researcher", nickname="researcher"
+                ),
+                MultiagentRosterEntry(
+                    type="agent", id="ag_writer", version=3, nickname="writer"
+                ),
+            ],
+            max_concurrent_threads=4,
+        )
+        payload = build_coordinator_payload(cfg)
+        assert "multiagent" in payload
+        block = payload["multiagent"]
+        assert block["type"] == "coordinator"
+        assert block["max_concurrent_threads"] == 4
+        assert len(block["agents"]) == 2
+        assert block["agents"][0] == {
+            "type": "agent",
+            "id": "ag_researcher",
+            "nickname": "researcher",
+        }
+        assert block["agents"][1] == {
+            "type": "agent",
+            "id": "ag_writer",
+            "version": 3,
+            "nickname": "writer",
+        }
+        assert "prompt_routing_hint" not in block
+
+    def test_with_self_and_hint(self):
+        cfg = MultiagentConfig(
+            roster=[
+                MultiagentRosterEntry(type="self", nickname="root"),
+                MultiagentRosterEntry(
+                    type="agent", id="ag_translator", nickname="translator"
+                ),
+            ],
+            max_concurrent_threads=2,
+            prompt_routing_hint="Route translations through translator.",
+        )
+        payload = build_coordinator_payload(cfg)
+        block = payload["multiagent"]
+        assert block["agents"][0] == {"type": "self", "nickname": "root"}
+        assert block["prompt_routing_hint"] == "Route translations through translator."
+
+
+# ---------------------------------------------------------------------------
+# parse_thread_event - SSE event types
+# ---------------------------------------------------------------------------
+
+class TestParseThreadEvent:
+    def test_thread_message_received(self):
+        ev = parse_thread_event({
+            "type": "agent.thread_message_received",
+            "thread_id": "thr_123",
+            "agent_id": "ag_researcher",
+            "message": {"role": "assistant", "content": "hello"},
+        })
+        assert isinstance(ev, ThreadEvent)
+        assert ev.event_type == "agent.thread_message_received"
+        assert ev.thread_id == "thr_123"
+        assert ev.agent_id == "ag_researcher"
+        assert ev.payload["message"]["role"] == "assistant"
+
+    def test_thread_message_sent(self):
+        ev = parse_thread_event({
+            "type": "agent.thread_message_sent",
+            "thread_id": "thr_124",
+            "agent_id": "ag_writer",
+            "tokens": 42,
+        })
+        assert ev.event_type == "agent.thread_message_sent"
+        assert ev.thread_id == "thr_124"
+        assert ev.agent_id == "ag_writer"
+        assert ev.payload["tokens"] == 42
+
+    def test_session_thread_started_nested_data(self):
+        ev = parse_thread_event({
+            "type": "session.thread_started",
+            "data": {
+                "thread_id": "thr_125",
+                "agent_id": "ag_analyst",
+                "parent_thread_id": "thr_parent",
+            },
+        })
+        assert ev.event_type == "session.thread_started"
+        assert ev.thread_id == "thr_125"
+        assert ev.agent_id == "ag_analyst"
+        assert ev.payload["parent_thread_id"] == "thr_parent"
+
+    def test_session_thread_completed_camelcase(self):
+        ev = parse_thread_event({
+            "type": "session.thread_completed",
+            "threadId": "thr_126",
+            "agentId": "ag_translator",
+            "status": "completed",
+            "duration_ms": 1234,
+        })
+        assert ev.event_type == "session.thread_completed"
+        assert ev.thread_id == "thr_126"
+        assert ev.agent_id == "ag_translator"
+        assert ev.payload["status"] == "completed"
+        assert ev.payload["duration_ms"] == 1234
+
+
+# ---------------------------------------------------------------------------
+# YAML templates parse and validate cleanly
+# ---------------------------------------------------------------------------
+
+TEMPLATE_DIR = (
+    Path(__file__).resolve().parent.parent
+    / "workflows"
+    / "coordinator-templates"
+)
+
+TEMPLATE_FILES = [
+    "research-and-write.yaml",
+    "code-review-and-test.yaml",
+    "analyst-with-translator.yaml",
+]
+
+
+@pytest.mark.parametrize("filename", TEMPLATE_FILES)
+def test_coordinator_template_parses(filename):
+    path = TEMPLATE_DIR / filename
+    assert path.exists(), f"missing template: {path}"
+    wf = parse_yaml_string(path.read_text())
+    assert wf.name
+    assert wf.risk_level == "limited"
+    assert any(s.type == "managed-agent" for s in wf.steps)
+
+
+@pytest.mark.parametrize("filename", TEMPLATE_FILES)
+def test_coordinator_template_validates(filename):
+    path = TEMPLATE_DIR / filename
+    wf = parse_yaml_string(path.read_text())
+    errors = validate(wf)
+    assert errors == [], f"{filename} validation errors: {errors}"
diff --git a/tests/test_outcomes.py b/tests/test_outcomes.py
new file mode 100644
index 00000000..7004eab9
--- /dev/null
+++ b/tests/test_outcomes.py
@@ -0,0 +1,292 @@
+"""Tests for the Anthropic Outcomes API client and composite aggregator.
+
+Uses unittest.mock to patch httpx.AsyncClient so no network calls are made.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import httpx
+import pytest
+
+from sandcastle.engine.outcomes import (
+    DEFAULT_BETA_HEADER,
+    DEFINE_OUTCOME_EVENT_TYPE,
+    OUTCOME_EVAL_END_TYPE,
+    OUTCOME_EVAL_START_TYPE,
+    AnthropicOutcomesClient,
+    OutcomeDefinition,
+    OutcomeEvaluation,
+    OutcomesAPIError,
+    OutcomeValidationError,
+    aggregate_outcomes,
+    build_define_outcome_event,
+    parse_outcome_evaluation,
+)
+
+
+# ---------------------------------------------------------------------------
+# httpx mocking helpers
+# ---------------------------------------------------------------------------
+def _make_response(
+    status: int = 200,
+    json_body: Any = None,
+    text: str = "",
+) -> MagicMock:
+    resp = MagicMock(spec=httpx.Response)
+    resp.status_code = status
+    resp.text = text or ""
+    if json_body is None:
+        json_body = {}
+    resp.json = MagicMock(return_value=json_body)
+    return resp
+
+
+class _CapturingClient:
+    def __init__(self, calls: list[dict[str, Any]], response: MagicMock) -> None:
+        self._calls = calls
+        self._response = response
+
+    async def __aenter__(self) -> "_CapturingClient":
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb) -> None:
+        return None
+
+    async def request(self, method: str, url: str, **kwargs: Any) -> MagicMock:
+        self._calls.append({"method": method, "url": url, **kwargs})
+        return self._response
+
+
+def _patch_httpx(response: MagicMock) -> tuple[Any, list[dict[str, Any]]]:
+    calls: list[dict[str, Any]] = []
+
+    def _factory(*args: Any, **kwargs: Any) -> _CapturingClient:
+        return _CapturingClient(calls, response)
+
+    return patch("sandcastle.engine.outcomes.httpx.AsyncClient", _factory), calls
+
+
+# ---------------------------------------------------------------------------
+# build_define_outcome_event
+# ---------------------------------------------------------------------------
+def test_build_define_outcome_event_minimal_shape() -> None:
+    definition = OutcomeDefinition(
+        name="answers_question",
+        description="Final response must address the user's question.",
+        success_criteria=["Mentions the topic", "Includes a citation"],
+    )
+    event = build_define_outcome_event(definition)
+    assert event["type"] == DEFINE_OUTCOME_EVENT_TYPE
+    assert event["type"] == "user.define_outcome"
+    outcome = event["outcome"]
+    assert outcome["name"] == "answers_question"
+    assert outcome["description"].startswith("Final response")
+    assert outcome["success_criteria"] == [
+        "Mentions the topic",
+        "Includes a citation",
+    ]
+    assert outcome["weight"] == 1.0
+    # Default model omission keeps the platform free to pick a judge.
+    assert "model" not in outcome
+
+
+def test_build_define_outcome_event_includes_model_override() -> None:
+    definition = OutcomeDefinition(
+        name="safe_output",
+        description="Output must not contain PII.",
+        success_criteria=["No emails", "No phone numbers"],
+        weight=2.5,
+        model="claude-opus-4-7",
+    )
+    event = build_define_outcome_event(definition)
+    assert event["outcome"]["weight"] == 2.5
+    assert event["outcome"]["model"] == "claude-opus-4-7"
+
+
+# ---------------------------------------------------------------------------
+# parse_outcome_evaluation
+# ---------------------------------------------------------------------------
+def test_parse_outcome_evaluation_returns_none_for_wrong_type() -> None:
+    start_event = {
+        "type": OUTCOME_EVAL_START_TYPE,
+        "outcome": {"name": "answers_question"},
+    }
+    assert parse_outcome_evaluation(start_event) is None
+    assert parse_outcome_evaluation({"type": "message.delta"}) is None
+    assert parse_outcome_evaluation({}) is None
+
+
+def test_parse_outcome_evaluation_parses_full_event() -> None:
+    event = {
+        "type": OUTCOME_EVAL_END_TYPE,
+        "outcome": {
+            "name": "answers_question",
+            "passed": True,
+            "score": 0.92,
+            "reasoning": "Cited two sources and answered directly.",
+            "evaluator_model": "claude-sonnet-4-7",
+            "started_at": "2026-05-10T08:00:00Z",
+            "completed_at": "2026-05-10T08:00:04Z",
+            "cost_usd": 0.0123,
+        },
+    }
+    parsed = parse_outcome_evaluation(event)
+    assert isinstance(parsed, OutcomeEvaluation)
+    assert parsed.outcome_name == "answers_question"
+    assert parsed.passed is True
+    assert parsed.score == 0.92
+    assert parsed.reasoning.startswith("Cited")
+    assert parsed.evaluator_model == "claude-sonnet-4-7"
+    assert isinstance(parsed.started_at, datetime)
+    assert isinstance(parsed.completed_at, datetime)
+    assert parsed.cost_usd == 0.0123
+
+
+# ---------------------------------------------------------------------------
+# aggregate_outcomes
+# ---------------------------------------------------------------------------
+def _ev(name: str, passed: bool, cost: float = 0.01) -> OutcomeEvaluation:
+    now = datetime.now(tz=timezone.utc)
+    return OutcomeEvaluation(
+        outcome_name=name,
+        passed=passed,
+        score=1.0 if passed else 0.0,
+        reasoning="",
+        evaluator_model="claude-sonnet-4-7",
+        started_at=now,
+        completed_at=now,
+        cost_usd=cost,
+    )
+
+
+def test_aggregate_outcomes_equal_weights() -> None:
+    evaluations = [
+        _ev("a", True, 0.01),
+        _ev("b", False, 0.02),
+        _ev("c", True, 0.03),
+        _ev("d", True, 0.04),
+    ]
+    result = aggregate_outcomes(evaluations)
+    # 3 of 4 passed with equal weights -> 0.75
+    assert result["composite_score"] == pytest.approx(0.75)
+    assert result["pass_count"] == 3
+    assert result["fail_count"] == 1
+    assert result["total_cost_usd"] == pytest.approx(0.10)
+    assert result["evaluated"] == 4
+
+
+def test_aggregate_outcomes_custom_weights() -> None:
+    evaluations = [
+        _ev("critical", True),
+        _ev("nice_to_have", False),
+    ]
+    result = aggregate_outcomes(
+        evaluations, weights={"critical": 4.0, "nice_to_have": 1.0}
+    )
+    # passed * weight / total_weight = (1 * 4 + 0 * 1) / 5 = 0.8
+    assert result["composite_score"] == pytest.approx(0.8)
+    assert result["weights_used"] == {"critical": 4.0, "nice_to_have": 1.0}
+
+
+def test_aggregate_outcomes_empty_list() -> None:
+    result = aggregate_outcomes([])
+    assert result["composite_score"] == 0.0
+    assert result["pass_count"] == 0
+    assert result["fail_count"] == 0
+    assert result["total_cost_usd"] == 0.0
+    assert result["evaluated"] == 0
+    assert result["weights_used"] == {}
+
+
+# ---------------------------------------------------------------------------
+# OutcomeValidationError
+# ---------------------------------------------------------------------------
+def test_outcome_validation_error_empty_success_criteria() -> None:
+    with pytest.raises(OutcomeValidationError):
+        OutcomeDefinition(
+            name="x",
+            description="d",
+            success_criteria=[],
+        )
+
+
+def test_outcome_validation_error_non_positive_weight() -> None:
+    with pytest.raises(OutcomeValidationError):
+        OutcomeDefinition(
+            name="x",
+            description="d",
+            success_criteria=["ok"],
+            weight=0.0,
+        )
+    with pytest.raises(OutcomeValidationError):
+        OutcomeDefinition(
+            name="x",
+            description="d",
+            success_criteria=["ok"],
+            weight=-1.0,
+        )
+
+
+# ---------------------------------------------------------------------------
+# AnthropicOutcomesClient
+# ---------------------------------------------------------------------------
+@pytest.fixture
+def client() -> AnthropicOutcomesClient:
+    return AnthropicOutcomesClient(api_key="sk-test-outcomes")
+
+
+@pytest.mark.asyncio
+async def test_define_outcome_posts_correct_body_and_beta_header(
+    client: AnthropicOutcomesClient,
+) -> None:
+    response = _make_response(
+        status=200, json_body={"id": "evt_123", "type": "user.define_outcome"}
+    )
+    patcher, calls = _patch_httpx(response)
+    with patcher:
+        definition = OutcomeDefinition(
+            name="answers_question",
+            description="Address the user's question.",
+            success_criteria=["Cites a source"],
+            weight=2.0,
+        )
+        result = await client.define_outcome("sess_abc", definition)
+    assert result == {"id": "evt_123", "type": "user.define_outcome"}
+    assert len(calls) == 1
+    call = calls[0]
+    assert call["method"] == "POST"
+    assert call["url"].endswith("/v1/sessions/sess_abc/events")
+    headers = call["headers"]
+    assert headers["x-api-key"] == "sk-test-outcomes"
+    assert headers["anthropic-beta"] == DEFAULT_BETA_HEADER
+    body = call["json"]
+    assert body["type"] == "user.define_outcome"
+    assert body["outcome"]["name"] == "answers_question"
+    assert body["outcome"]["success_criteria"] == ["Cites a source"]
+    assert body["outcome"]["weight"] == 2.0
+
+
+@pytest.mark.asyncio
+async def test_define_outcome_maps_4xx_to_readable_error(
+    client: AnthropicOutcomesClient,
+) -> None:
+    response = _make_response(
+        status=400,
+        json_body={"error": {"message": "invalid success_criteria"}},
+    )
+    patcher, _calls = _patch_httpx(response)
+    with patcher:
+        definition = OutcomeDefinition(
+            name="x",
+            description="d",
+            success_criteria=["only one"],
+        )
+        with pytest.raises(OutcomesAPIError) as exc_info:
+            await client.define_outcome("sess_abc", definition)
+    msg = str(exc_info.value)
+    assert "400" in msg
+    assert "invalid success_criteria" in msg
diff --git a/tests/test_tool_search.py b/tests/test_tool_search.py
new file mode 100644
index 00000000..f297fdf5
--- /dev/null
+++ b/tests/test_tool_search.py
@@ -0,0 +1,216 @@
+"""Tests for the tool search + tool use examples registry."""
+
+from __future__ import annotations
+
+import pytest
+
+from sandcastle.engine.tool_search import (
+    ToolDefinition,
+    ToolRegistry,
+    validate_tool,
+)
+
+
+# ---------------------------------------------------------------- helpers
+
+
+def _make_tool(
+    name: str,
+    description: str | None = None,
+    tags: list[str] | None = None,
+    defer_loading: bool = False,
+    examples: list[dict] | None = None,
+    parameters: dict | None = None,
+) -> ToolDefinition:
+    return ToolDefinition(
+        name=name,
+        description=description or f"{name} is a sample tool used for exercises in tests.",
+        parameters=parameters
+        or {
+            "type": "object",
+            "properties": {"query": {"type": "string"}},
+            "required": ["query"],
+        },
+        examples=examples
+        or [
+            {"input": {"query": "hello"}, "output": {"result": "world"}},
+        ],
+        defer_loading=defer_loading,
+        tags=tags or [],
+    )
+
+
+# ---------------------------------------------------------------- search
+
+
+def test_search_exact_name_match_ranks_first():
+    reg = ToolRegistry()
+    reg.register(_make_tool("send_email", tags=["email", "smtp"]))
+    reg.register(_make_tool("send_slack_message", tags=["slack", "chat"]))
+
+    hits = reg.search("send_email")
+    assert hits[0].name == "send_email"
+
+
+def test_search_exact_name_beats_partial_token_overlap():
+    reg = ToolRegistry()
+    # Description mentions "email" many times but name does not match exactly.
+    reg.register(
+        _make_tool(
+            "noisy_helper",
+            description="email email email helper for email workflows over email.",
+        )
+    )
+    reg.register(_make_tool("email", tags=["mail"]))
+
+    hits = reg.search("email")
+    assert hits[0].name == "email"
+
+
+def test_search_tag_overlap_ranks_multiple_tools():
+    reg = ToolRegistry()
+    reg.register(_make_tool("alpha", tags=["pdf", "ocr"]))
+    reg.register(_make_tool("beta", tags=["pdf"]))
+    reg.register(_make_tool("gamma", tags=["audio"]))
+
+    hits = reg.search("pdf ocr")
+    names = [t.name for t in hits]
+    assert "alpha" in names and "beta" in names
+    assert names.index("alpha") < names.index("beta")
+    assert "gamma" not in names
+
+
+def test_search_ranks_by_token_overlap_in_tags():
+    reg = ToolRegistry()
+    reg.register(_make_tool("one", tags=["report"]))
+    reg.register(_make_tool("two", tags=["report", "pdf", "charts"]))
+
+    hits = reg.search("report pdf charts")
+    assert hits[0].name == "two"
+
+
+# ---------------------------------------------------------------- partitions
+
+
+def test_hot_tools_excludes_deferred():
+    reg = ToolRegistry()
+    reg.register(_make_tool("fast"))
+    reg.register(_make_tool("rare", defer_loading=True))
+
+    hot = [t.name for t in reg.hot_tools()]
+    assert hot == ["fast"]
+
+
+def test_lazy_tools_only_includes_deferred():
+    reg = ToolRegistry()
+    reg.register(_make_tool("fast"))
+    reg.register(_make_tool("rare", defer_loading=True))
+    reg.register(_make_tool("rare2", defer_loading=True))
+
+    lazy = sorted(t.name for t in reg.lazy_tools())
+    assert lazy == ["rare", "rare2"]
+
+
+# ---------------------------------------------------------------- formatting
+
+
+def test_format_for_agent_produces_anthropic_shape():
+    reg = ToolRegistry()
+    reg.register(_make_tool("search_web"))
+
+    out = ToolRegistry.format_for_agent(reg.all())
+    assert len(out) == 1
+    entry = out[0]
+    assert entry["name"] == "search_web"
+    assert "description" in entry
+    assert entry["input_schema"]["type"] == "object"
+    assert isinstance(entry["examples"], list)
+    assert entry["examples"][0]["input"] == {"query": "hello"}
+
+
+def test_format_for_agent_omits_examples_key_when_none_present():
+    tool = ToolDefinition(
+        name="bare",
+        description="A bare tool with no worked examples baked in at all.",
+        parameters={"type": "object", "properties": {}},
+        examples=[],
+    )
+    out = ToolRegistry.format_for_agent([tool])
+    assert out[0]["name"] == "bare"
+    assert "examples" not in out[0]
+
+
+# ---------------------------------------------------------------- validation
+
+
+def test_validate_rejects_zero_examples():
+    tool = ToolDefinition(
+        name="t",
+        description="A tool with a sufficiently long description for the validator.",
+        parameters={"type": "object", "properties": {}},
+        examples=[],
+    )
+    errors = validate_tool(tool)
+    assert any("1 to 5" in e for e in errors)
+
+
+def test_validate_rejects_more_than_five_examples():
+    tool = _make_tool(
+        "t",
+        examples=[
+            {"input": {"query": f"q{i}"}, "output": {"r": i}} for i in range(6)
+        ],
+    )
+    errors = validate_tool(tool)
+    assert any("max is 5" in e for e in errors)
+
+
+def test_validate_rejects_input_that_violates_parameters_schema():
+    tool = _make_tool(
+        "t",
+        examples=[
+            {"input": {"query": 123}, "output": {"r": "ok"}},  # query must be str
+        ],
+    )
+    errors = validate_tool(tool)
+    assert any("fails parameters schema" in e for e in errors)
+
+
+def test_validate_rejects_too_short_description():
+    tool = _make_tool("t", description="short")
+    errors = validate_tool(tool)
+    assert any("20 characters" in e for e in errors)
+
+
+def test_validate_passes_well_formed_tool():
+    tool = _make_tool("good")
+    assert validate_tool(tool) == []
+
+
+# ---------------------------------------------------------------- misc
+
+
+def test_search_returns_empty_when_registry_empty():
+    reg = ToolRegistry()
+    assert reg.search("anything") == []
+
+
+def test_search_limit_is_respected():
+    reg = ToolRegistry()
+    for i in range(10):
+        reg.register(_make_tool(f"tool_{i}", tags=["common"]))
+
+    hits = reg.search("common", limit=3)
+    assert len(hits) == 3
+
+
+def test_register_replaces_existing_tool_with_same_name():
+    reg = ToolRegistry()
+    reg.register(_make_tool("dup", description="first version of the tool, long enough."))
+    reg.register(_make_tool("dup", description="second version of the tool, long enough."))
+    assert len(reg) == 1
+    assert reg.get("dup").description.startswith("second")
+
+
+if __name__ == "__main__":  # pragma: no cover
+    pytest.main([__file__, "-v"])
diff --git a/tests/test_trajectory_replay.py b/tests/test_trajectory_replay.py
new file mode 100644
index 00000000..bbb1a555
--- /dev/null
+++ b/tests/test_trajectory_replay.py
@@ -0,0 +1,285 @@
+"""Tests for the trajectory replay primitives."""
+
+from __future__ import annotations
+
+from datetime import datetime, timedelta, timezone
+
+import pytest
+
+from sandcastle.engine.trajectory_replay import (
+    ToolCall,
+    Trajectory,
+    compute_trajectory_checksum,
+    diff_trajectories,
+    extract_trajectory,
+    replay_score,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _tc(
+    step_id: str,
+    tool_name: str = "search",
+    args: dict | None = None,
+    output: dict | str | None = None,
+    error: str | None = None,
+    duration_ms: int = 100,
+    ts: datetime | None = None,
+) -> ToolCall:
+    return ToolCall(
+        step_id=step_id,
+        tool_name=tool_name,
+        args=args if args is not None else {"q": "hi"},
+        output=output if output is not None else {"ok": True},
+        error=error,
+        duration_ms=duration_ms,
+        ts=ts or datetime(2026, 1, 1, tzinfo=timezone.utc),
+    )
+
+
+def _traj(tool_calls: list[ToolCall], final_output: dict | None = None) -> Trajectory:
+    t = Trajectory(
+        run_id="run_test",
+        workflow_name="demo",
+        version=1,
+        tool_calls=tool_calls,
+        total_cost_usd=sum(0.001 for _ in tool_calls),
+        total_duration_ms=sum(tc.duration_ms for tc in tool_calls),
+        final_output=final_output if final_output is not None else {"answer": 42},
+    )
+    t.checksum = compute_trajectory_checksum(t)
+    return t
+
+
+# ---------------------------------------------------------------------------
+# compute_trajectory_checksum
+# ---------------------------------------------------------------------------
+
+
+def test_checksum_is_deterministic_for_same_input():
+    t1 = _traj([_tc("a"), _tc("b", tool_name="fetch")])
+    t2 = _traj([_tc("a"), _tc("b", tool_name="fetch")])
+    assert t1.checksum == t2.checksum
+    # Re-compute should also be stable.
+    assert compute_trajectory_checksum(t1) == compute_trajectory_checksum(t2)
+    # SHA-256 hex is 64 chars.
+    assert len(t1.checksum) == 64
+
+
+def test_checksum_changes_when_tool_call_order_changes():
+    a = _tc("a", tool_name="search")
+    b = _tc("b", tool_name="fetch")
+    forward = _traj([a, b])
+    reverse = _traj([b, a])
+    assert forward.checksum != reverse.checksum
+
+
+# ---------------------------------------------------------------------------
+# extract_trajectory
+# ---------------------------------------------------------------------------
+
+
+def test_extract_trajectory_builds_correct_object():
+    t0 = datetime(2026, 5, 1, 12, 0, 0, tzinfo=timezone.utc)
+    audit = [
+        {"event_type": "step.started", "step_id": "s1", "ts": t0},
+        {
+            "event_type": "step.completed",
+            "step_id": "s1",
+            "ts": t0 + timedelta(milliseconds=250),
+        },
+        {
+            "event_type": "step.started",
+            "step_id": "s2",
+            "ts": t0 + timedelta(milliseconds=300),
+        },
+        {
+            "event_type": "step.completed",
+            "step_id": "s2",
+            "ts": t0 + timedelta(milliseconds=800),
+        },
+    ]
+    steps = [
+        {
+            "step_id": "s1",
+            "tool_name": "search",
+            "args": {"q": "hello"},
+            "output": {"hits": 3},
+            "error": None,
+            "cost_usd": 0.002,
+            "workflow_name": "demo_flow",
+            "version": 7,
+        },
+        {
+            "step_id": "s2",
+            "tool_name": "summarize",
+            "args": {"text": "..."},
+            "output": "done",
+            "error": None,
+            "cost_usd": 0.003,
+            "final_output": {"answer": "all good"},
+        },
+    ]
+
+    traj = extract_trajectory("run_xyz", audit, steps)
+
+    assert traj.run_id == "run_xyz"
+    assert traj.workflow_name == "demo_flow"
+    assert traj.version == 7
+    assert [tc.step_id for tc in traj.tool_calls] == ["s1", "s2"]
+    assert traj.tool_calls[0].duration_ms == 250
+    assert traj.tool_calls[1].duration_ms == 500
+    assert traj.total_duration_ms == 750
+    assert traj.total_cost_usd == pytest.approx(0.005)
+    assert traj.final_output == {"answer": "all good"}
+    assert len(traj.checksum) == 64
+
+
+def test_extract_trajectory_uses_audit_order_not_step_order():
+    t0 = datetime(2026, 5, 1, 9, 0, 0, tzinfo=timezone.utc)
+    audit = [
+        {"event_type": "step.started", "step_id": "second", "ts": t0},
+        {
+            "event_type": "step.completed",
+            "step_id": "second",
+            "ts": t0 + timedelta(milliseconds=100),
+        },
+        {
+            "event_type": "step.started",
+            "step_id": "first",
+            "ts": t0 + timedelta(milliseconds=200),
+        },
+        {
+            "event_type": "step.completed",
+            "step_id": "first",
+            "ts": t0 + timedelta(milliseconds=300),
+        },
+    ]
+    steps = [
+        {"step_id": "first", "tool_name": "a", "args": {}, "output": {}, "cost_usd": 0},
+        {"step_id": "second", "tool_name": "b", "args": {}, "output": {}, "cost_usd": 0},
+    ]
+    traj = extract_trajectory("run_2", audit, steps)
+    assert [tc.step_id for tc in traj.tool_calls] == ["second", "first"]
+
+
+# ---------------------------------------------------------------------------
+# diff_trajectories
+# ---------------------------------------------------------------------------
+
+
+def test_diff_detects_added_tool_call():
+    golden = _traj([_tc("a")])
+    candidate = _traj([_tc("a"), _tc("b", tool_name="fetch")])
+    diff = diff_trajectories(golden, candidate)
+    kinds = [d.kind for d in diff.tool_call_diffs]
+    assert "added" in kinds
+    added = [d for d in diff.tool_call_diffs if d.kind == "added"][0]
+    assert added.step_id == "b"
+    assert added.golden is None
+    assert added.candidate is not None
+
+
+def test_diff_detects_removed_tool_call():
+    golden = _traj([_tc("a"), _tc("b", tool_name="fetch")])
+    candidate = _traj([_tc("a")])
+    diff = diff_trajectories(golden, candidate)
+    removed = [d for d in diff.tool_call_diffs if d.kind == "removed"]
+    assert len(removed) == 1
+    assert removed[0].step_id == "b"
+
+
+def test_diff_detects_args_changed():
+    golden = _traj([_tc("a", args={"q": "alpha"})])
+    candidate = _traj([_tc("a", args={"q": "beta"})])
+    diff = diff_trajectories(golden, candidate)
+    kinds = [d.kind for d in diff.tool_call_diffs]
+    assert "args_changed" in kinds
+
+
+def test_diff_detects_output_changed():
+    golden = _traj([_tc("a", output={"hits": 1})])
+    candidate = _traj([_tc("a", output={"hits": 99})])
+    diff = diff_trajectories(golden, candidate)
+    kinds = [d.kind for d in diff.tool_call_diffs]
+    assert "output_changed" in kinds
+
+
+def test_diff_detects_order_changed_when_same_set():
+    a = _tc("a", tool_name="search")
+    b = _tc("b", tool_name="fetch")
+    golden = _traj([a, b])
+    candidate = _traj([b, a])
+    diff = diff_trajectories(golden, candidate)
+    kinds = [d.kind for d in diff.tool_call_diffs]
+    assert "order_changed" in kinds
+    # Same set, no added/removed.
+    assert "added" not in kinds
+    assert "removed" not in kinds
+
+
+def test_diff_zero_when_identical():
+    a = _tc("a", tool_name="search", args={"q": "x"}, output={"ok": True})
+    b = _tc("b", tool_name="fetch", args={"u": "/y"}, output={"data": [1]})
+    golden = _traj([a, b])
+    candidate = _traj([a, b])
+    diff = diff_trajectories(golden, candidate)
+    assert diff.tool_call_diffs == []
+    assert diff.cost_delta_usd == pytest.approx(0.0)
+    assert diff.duration_delta_ms == 0
+    assert diff.final_output_match is True
+    assert "0 tool-call diff" in diff.summary
+
+
+# ---------------------------------------------------------------------------
+# replay_score
+# ---------------------------------------------------------------------------
+
+
+def test_replay_score_returns_one_on_identical():
+    a = _tc("a")
+    b = _tc("b", tool_name="fetch")
+    golden = _traj([a, b])
+    candidate = _traj([a, b])
+    diff = diff_trajectories(golden, candidate)
+    assert replay_score(diff) == pytest.approx(1.0)
+
+
+def test_replay_score_drops_when_output_mismatches():
+    golden = _traj([_tc("a", output={"ok": True})], final_output={"answer": 1})
+    candidate = _traj(
+        [_tc("a", output={"ok": False})],
+        final_output={"answer": 2},
+    )
+    diff = diff_trajectories(golden, candidate)
+    score = replay_score(diff)
+    assert 0.0 <= score < 1.0
+    # With defaults: output_changed -> tool_match = 0.5, final mismatch
+    # -> 0.0, cost within budget -> 1.0. Score = 0.6*0.5 + 0.3*0 + 0.1*1
+    # = 0.4.
+    assert score == pytest.approx(0.4, abs=1e-6)
+
+
+def test_replay_score_respects_custom_weights():
+    golden = _traj([_tc("a")], final_output={"answer": 1})
+    candidate = _traj([_tc("a")], final_output={"answer": 2})
+    diff = diff_trajectories(golden, candidate)
+    # Make final_output dominate, drop tool_match weight.
+    score = replay_score(
+        diff,
+        weights={"tool_match": 0.0, "final_output": 1.0, "cost": 0.0},
+    )
+    # Final output mismatches -> 0 with this weighting.
+    assert score == pytest.approx(0.0)
+
+    # Flip the weighting: tool calls match perfectly, ignore final
+    # output entirely -> score should be 1.0.
+    score_flipped = replay_score(
+        diff,
+        weights={"tool_match": 1.0, "final_output": 0.0, "cost": 0.0},
+    )
+    assert score_flipped == pytest.approx(1.0)
diff --git a/tests/test_v032_wiring.py b/tests/test_v032_wiring.py
new file mode 100644
index 00000000..0ba6fea8
--- /dev/null
+++ b/tests/test_v032_wiring.py
@@ -0,0 +1,595 @@
+"""End-to-end wiring tests for v0.32 prep modules.
+
+These tests verify that the 9 new modules (memory_stores, multiagent,
+agent_webhooks, tool_search, outcomes, trajectory_replay, agent_skills,
+computer_use, agent_sdk_runtime) are wired into the Sandcastle backend.
+
+All HTTP/DB/Anthropic calls are mocked so the suite runs fast (<5s) and
+makes zero outbound network calls.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from sandcastle.api.agent_webhooks import router as agent_webhooks_router
+from sandcastle.engine import executor as _executor_mod
+from sandcastle.engine.dag import (
+    ManagedAgentConfig,
+    StepDefinition,
+    VALID_STEP_TYPES,
+)
+from sandcastle.engine.executor import (
+    RunContext,
+    _execute_computer_use_step,
+    _execute_managed_agent_step,
+    _execute_trajectory_replay_step,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_context(**overrides) -> RunContext:
+    defaults = dict(
+        run_id="run-wiring-1",
+        input={"topic": "x"},
+        step_outputs={},
+        step_results={},
+    )
+    defaults.update(overrides)
+    return RunContext(**defaults)
+
+
+def _mock_sse_stream(events: list[dict]):
+    """Build a stub httpx streaming response that yields SSE lines."""
+    lines = [f"data: {json.dumps(e)}" for e in events]
+    lines.append("")
+
+    class FakeStream:
+        async def aiter_lines(self):
+            for line in lines:
+                yield line
+
+    stream_ctx = AsyncMock()
+    stream_ctx.__aenter__ = AsyncMock(return_value=FakeStream())
+    stream_ctx.__aexit__ = AsyncMock(return_value=False)
+    return stream_ctx
+
+
+def _make_managed_agent_mock(
+    *,
+    captured_session_body: dict | None = None,
+    captured_agent_body: dict | None = None,
+    captured_events_payloads: list[dict] | None = None,
+    sse_events: list[dict] | None = None,
+):
+    """Construct an httpx.AsyncClient mock that records what was sent."""
+
+    mock_client = AsyncMock()
+
+    async def mock_post(url, **kwargs):
+        resp = MagicMock()
+        resp.status_code = 200
+        body = kwargs.get("json", {}) or {}
+        if "/agents" in url and "/sessions" not in url:
+            if captured_agent_body is not None:
+                captured_agent_body.clear()
+                captured_agent_body.update(body)
+            resp.json.return_value = {"id": "ag_xx"}
+        elif "/environments" in url:
+            resp.json.return_value = {"id": "env_xx"}
+        elif "/sessions" in url and "/events" in url:
+            if captured_events_payloads is not None:
+                captured_events_payloads.append(body)
+            resp.json.return_value = {}
+        elif "/sessions" in url:
+            if captured_session_body is not None:
+                captured_session_body.clear()
+                captured_session_body.update(body)
+            resp.json.return_value = {"id": "sess_xx"}
+        else:
+            resp.json.return_value = {}
+        return resp
+
+    mock_client.post = AsyncMock(side_effect=mock_post)
+    mock_client.delete = AsyncMock(return_value=MagicMock(status_code=200))
+    mock_client.stream = MagicMock(
+        return_value=_mock_sse_stream(
+            sse_events
+            or [
+                {
+                    "type": "agent.message",
+                    "content": [{"type": "text", "text": "ok"}],
+                },
+                {"type": "session.status_idle"},
+            ]
+        )
+    )
+    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+    mock_client.__aexit__ = AsyncMock(return_value=False)
+
+    cleanup = AsyncMock()
+    cleanup.delete = AsyncMock(return_value=MagicMock(status_code=200))
+    cleanup.__aenter__ = AsyncMock(return_value=cleanup)
+    cleanup.__aexit__ = AsyncMock(return_value=False)
+
+    return mock_client, cleanup
+
+
+def _clear_caches():
+    _executor_mod._managed_agent_cache.clear()
+    _executor_mod._managed_env_cache.clear()
+
+
+# ---------------------------------------------------------------------------
+# 1. trajectory-replay step
+# ---------------------------------------------------------------------------
+
+
+class TestTrajectoryReplayStep:
+    """Trajectory-replay step parses + executes against synthetic DB rows."""
+
+    def test_step_type_registered(self):
+        assert "trajectory-replay" in VALID_STEP_TYPES
+
+    @pytest.mark.asyncio
+    async def test_executes_with_mocked_run_data(self):
+        """Mock async_session to yield identical golden + candidate runs.
+
+        With identical data, replay_score == 1.0 and the step passes.
+        """
+        step = StepDefinition(
+            id="tr-1",
+            type="trajectory-replay",
+            trajectory_replay_config={
+                "golden_run_id": "golden-abc",
+                "fail_below_score": 0.5,
+                "allow_cost_delta_pct": 50.0,
+            },
+        )
+        ctx = _make_context(run_id="cand-xyz")
+
+        # Build synthetic SQLAlchemy result rows for both runs.
+        class _StepRow:
+            def __init__(self, sid):
+                self.step_id = sid
+                self.output_data = {"tool_name": "bash", "args": {}}
+                self.error = None
+                self.cost_usd = 0.001
+                self.duration_seconds = 0.5
+                self.started_at = None
+
+        class _Result:
+            def __init__(self, items):
+                self._items = items
+
+            def scalars(self):
+                return self
+
+            def all(self):
+                return self._items
+
+        async def _execute(query):
+            # Inspect the entity type from the compiled SQL via class name.
+            from sandcastle.models.db import AuditEvent, RunStep
+
+            entity = query.column_descriptions[0]["entity"]
+            if entity is RunStep:
+                return _Result([_StepRow("s1"), _StepRow("s2")])
+            if entity is AuditEvent:
+                return _Result([])
+            return _Result([])
+
+        mock_session = AsyncMock()
+        mock_session.execute = AsyncMock(side_effect=_execute)
+        mock_session.__aenter__ = AsyncMock(return_value=mock_session)
+        mock_session.__aexit__ = AsyncMock(return_value=False)
+
+        with patch(
+            "sandcastle.models.db.async_session",
+            return_value=mock_session,
+        ):
+            result = await _execute_trajectory_replay_step(step, ctx)
+
+        assert result.status == "completed", result.error
+        assert result.output["pass"] is True
+        assert result.output["score"] >= 0.5
+        assert result.output["golden_run_id"] == "golden-abc"
+
+
+# ---------------------------------------------------------------------------
+# 2. computer-use step
+# ---------------------------------------------------------------------------
+
+
+class TestComputerUseStep:
+    """Computer-use step parses + executes returning screenshots + actions."""
+
+    def test_step_type_registered(self):
+        assert "computer-use" in VALID_STEP_TYPES
+
+    @pytest.mark.asyncio
+    async def test_executes_and_returns_payload(self):
+        step = StepDefinition(
+            id="cu-1",
+            type="computer-use",
+            computer_use_config={
+                "display_width_px": 1280,
+                "display_height_px": 800,
+                "tools": ["bash", "text_editor", "computer"],
+                "model": "claude-sonnet-4-6",
+                "message": "Open browser",
+            },
+        )
+        ctx = _make_context()
+        result = await _execute_computer_use_step(step, ctx)
+        assert result.status == "completed", result.error
+        assert "screenshots" in result.output
+        assert "actions_taken" in result.output
+        assert isinstance(result.output["screenshots"], list)
+        assert isinstance(result.output["actions_taken"], list)
+        # The Computer Use beta header must be populated.
+        assert result.output["beta_header"].startswith("computer-use-")
+
+
+# ---------------------------------------------------------------------------
+# 3. managed-agent + memory_stores
+# ---------------------------------------------------------------------------
+
+
+class TestManagedAgentMemoryStores:
+    """managed-agent step injects memory_stores into session-create."""
+
+    def setup_method(self):
+        _clear_caches()
+
+    @pytest.mark.asyncio
+    async def test_memory_stores_merged_into_session_resources(self):
+        step = StepDefinition(
+            id="ma-mem",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="ag_existing",
+                environment_id="env_existing",
+                message="hi",
+                memory_stores=["ms_a", "ms_b"],
+            ),
+        )
+        ctx = _make_context()
+
+        captured_session_body: dict = {}
+        mock_client, cleanup = _make_managed_agent_mock(
+            captured_session_body=captured_session_body,
+        )
+
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch(
+                "httpx.AsyncClient",
+                side_effect=[mock_client, cleanup],
+            ):
+                result = await _execute_managed_agent_step(step, ctx)
+
+        assert result.status == "completed", result.error
+        assert "resources" in captured_session_body
+        resources = captured_session_body["resources"]
+        assert {"type": "memory_store", "id": "ms_a"} in resources
+        assert {"type": "memory_store", "id": "ms_b"} in resources
+
+
+# ---------------------------------------------------------------------------
+# 4. managed-agent + multiagent (validation + valid payload)
+# ---------------------------------------------------------------------------
+
+
+class TestManagedAgentMultiagent:
+    """managed-agent step builds + validates multiagent rosters."""
+
+    def setup_method(self):
+        _clear_caches()
+
+    @pytest.mark.asyncio
+    async def test_invalid_roster_fails_step(self):
+        step = StepDefinition(
+            id="ma-mai",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto",
+                message="hi",
+                multiagent={
+                    # Two self entries is invalid per validate_roster.
+                    "roster": [
+                        {"type": "self"},
+                        {"type": "self"},
+                    ],
+                },
+            ),
+        )
+        ctx = _make_context()
+
+        mock_client, cleanup = _make_managed_agent_mock()
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch(
+                "httpx.AsyncClient",
+                side_effect=[mock_client, cleanup],
+            ):
+                result = await _execute_managed_agent_step(step, ctx)
+
+        assert result.status == "failed"
+        assert "multiagent" in result.error.lower() or "self" in result.error.lower()
+
+    @pytest.mark.asyncio
+    async def test_valid_roster_sends_coordinator_payload(self):
+        step = StepDefinition(
+            id="ma-mav",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="auto",
+                message="hi",
+                multiagent={
+                    "roster": [
+                        {"type": "agent", "id": "ag_1", "nickname": "researcher"},
+                        {"type": "agent", "id": "ag_2", "nickname": "writer"},
+                    ],
+                    "max_concurrent_threads": 5,
+                    "prompt_routing_hint": "delegate research to researcher",
+                },
+            ),
+        )
+        ctx = _make_context()
+
+        captured_agent_body: dict = {}
+        mock_client, cleanup = _make_managed_agent_mock(
+            captured_agent_body=captured_agent_body,
+        )
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch(
+                "httpx.AsyncClient",
+                side_effect=[mock_client, cleanup],
+            ):
+                result = await _execute_managed_agent_step(step, ctx)
+
+        assert result.status == "completed", result.error
+        assert "multiagent" in captured_agent_body
+        ma = captured_agent_body["multiagent"]
+        assert ma["type"] == "coordinator"
+        assert ma["max_concurrent_threads"] == 5
+        assert ma["prompt_routing_hint"] == "delegate research to researcher"
+        nicknames = [a.get("nickname") for a in ma["agents"]]
+        assert "researcher" in nicknames
+        assert "writer" in nicknames
+
+
+# ---------------------------------------------------------------------------
+# 5. managed-agent + outcomes
+# ---------------------------------------------------------------------------
+
+
+class TestManagedAgentOutcomes:
+    """managed-agent step POSTs define_outcome events for each outcome."""
+
+    def setup_method(self):
+        _clear_caches()
+
+    @pytest.mark.asyncio
+    async def test_define_outcome_events_posted(self):
+        step = StepDefinition(
+            id="ma-out",
+            type="managed-agent",
+            managed_agent_config=ManagedAgentConfig(
+                agent_id="ag_existing",
+                environment_id="env_existing",
+                message="hi",
+                outcomes=[
+                    {
+                        "name": "accuracy",
+                        "description": "Output is factually correct",
+                        "success_criteria": ["No hallucinations"],
+                        "weight": 2.0,
+                    },
+                    {
+                        "name": "brevity",
+                        "description": "Output is under 100 words",
+                        "success_criteria": ["Under 100 words"],
+                    },
+                ],
+            ),
+        )
+        ctx = _make_context()
+
+        captured_event_payloads: list[dict] = []
+        mock_client, cleanup = _make_managed_agent_mock(
+            captured_events_payloads=captured_event_payloads,
+        )
+
+        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "k"}):
+            with patch(
+                "httpx.AsyncClient",
+                side_effect=[mock_client, cleanup],
+            ):
+                result = await _execute_managed_agent_step(step, ctx)
+
+        assert result.status == "completed", result.error
+        # Two define_outcome events + one user.message event.
+        define_events = []
+        for payload in captured_event_payloads:
+            for evt in payload.get("events", []):
+                if evt.get("type") == "user.define_outcome":
+                    define_events.append(evt)
+        assert len(define_events) == 2
+        names = {e["outcome"]["name"] for e in define_events}
+        assert names == {"accuracy", "brevity"}
+
+
+# ---------------------------------------------------------------------------
+# 6. agent_webhooks router mounted
+# ---------------------------------------------------------------------------
+
+
+class TestWebhooksMounted:
+    """The agent-webhooks router responds 200 in production after mount."""
+
+    def test_anthropic_webhook_responds_200(self, monkeypatch):
+        import hashlib
+        import hmac
+
+        secret = "wh-secret"
+        monkeypatch.setenv("ANTHROPIC_WEBHOOK_SECRET", secret)
+
+        app = FastAPI()
+        app.include_router(agent_webhooks_router)
+        client = TestClient(app)
+
+        payload = {"type": "session.status_idle", "session_id": "s1"}
+        body = json.dumps(payload).encode()
+        sig = hmac.new(secret.encode(), body, hashlib.sha256).hexdigest()
+
+        resp = client.post(
+            "/agent-webhooks/anthropic",
+            content=body,
+            headers={
+                "X-Anthropic-Signature": sig,
+                "content-type": "application/json",
+            },
+        )
+        assert resp.status_code == 200
+
+
+# ---------------------------------------------------------------------------
+# 7. publish-skills CLI
+# ---------------------------------------------------------------------------
+
+
+class TestPublishSkillsCLI:
+    """`sandcastle publish-skills` lists / uploads workflows-as-skills."""
+
+    def test_dry_run_lists_workflows(self, tmp_path, capsys, monkeypatch):
+        """No --upload: prints JSON, does not call upload()."""
+
+        wf_dir = tmp_path / "wf"
+        wf_dir.mkdir()
+        (wf_dir / "demo.yaml").write_text(
+            "name: demo\n"
+            "description: a demo workflow used for publish-skills tests\n"
+            "default_model: sonnet\n"
+            "steps:\n"
+            "  - id: s1\n"
+            "    prompt: do x\n",
+            encoding="utf-8",
+        )
+
+        from sandcastle.__main__ import _cmd_publish_skills
+
+        args = SimpleNamespace(upload=False, dir=str(wf_dir))
+        _cmd_publish_skills(args)
+
+        captured = capsys.readouterr()
+        results = json.loads(captured.out)
+        assert isinstance(results, list)
+        assert results[0]["status"] == "dry_run"
+        assert results[0]["name"]
+
+    def test_upload_invokes_publish_with_dry_run_false(
+        self, tmp_path, capsys, monkeypatch
+    ):
+        """With --upload: publish_workflows_as_skills(dry_run=False) is called."""
+
+        wf_dir = tmp_path / "wf"
+        wf_dir.mkdir()
+        (wf_dir / "demo.yaml").write_text(
+            "name: demo\n"
+            "description: a demo workflow used for publish-skills tests\n"
+            "default_model: sonnet\n"
+            "steps:\n"
+            "  - id: s1\n"
+            "    prompt: do x\n",
+            encoding="utf-8",
+        )
+
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "k")
+
+        captured_kwargs: dict = {}
+
+        async def fake_publish(*, workflow_dir, dry_run, client):
+            captured_kwargs.update(
+                workflow_dir=workflow_dir,
+                dry_run=dry_run,
+                client=client,
+            )
+            return [{"path": "demo.yaml", "status": "uploaded"}]
+
+        with patch(
+            "sandcastle.engine.agent_skills.publish_workflows_as_skills",
+            new=fake_publish,
+        ):
+            from sandcastle.__main__ import _cmd_publish_skills
+
+            args = SimpleNamespace(upload=True, dir=str(wf_dir))
+            _cmd_publish_skills(args)
+
+        captured = capsys.readouterr()
+        results = json.loads(captured.out)
+        assert results[0]["status"] == "uploaded"
+        assert captured_kwargs["dry_run"] is False
+        assert captured_kwargs["client"] is not None
+
+
+# ---------------------------------------------------------------------------
+# 8. agent_runtime dispatch for "agent-sdk" + unknown runtimes
+# ---------------------------------------------------------------------------
+
+
+class TestAgentRuntimeDispatch:
+    """get_runtime('agent-sdk') routes to AgentSDKRunner."""
+
+    @pytest.mark.asyncio
+    async def test_agent_sdk_dispatch_calls_runner(self):
+        from sandcastle.engine import agent_runtime as ar_mod
+        from sandcastle.engine.agent_runtime import get_runtime
+        from sandcastle.engine.agent_sdk_runtime import AgentSDKResult
+
+        runtime = get_runtime("agent-sdk")
+        assert runtime.name == "agent-sdk"
+
+        async def fake_run(self, prompt, config):  # noqa: ARG001
+            return AgentSDKResult(
+                output="hello from sdk",
+                tool_calls=[],
+                cost_usd=0.001,
+                duration_ms=42,
+            )
+
+        with patch(
+            "sandcastle.engine.agent_sdk_runtime.AgentSDKRunner.run",
+            new=fake_run,
+        ):
+            result = await runtime.execute(
+                system_prompt="be brief",
+                tools=[],
+                packages=[],
+                message="hi",
+                model="claude-sonnet-4-6",
+                timeout=30,
+                network="unrestricted",
+            )
+
+        assert result["output"] == "hello from sdk"
+        assert result["runtime"] == "agent-sdk"
+        assert result["cost_usd"] == 0.001
+        assert result["duration_ms"] == 42
+
+    def test_unknown_runtime_raises(self):
+        from sandcastle.engine.agent_runtime import get_runtime
+
+        with pytest.raises(ValueError):
+            get_runtime("not-a-real-runtime")
diff --git a/workflows/coordinator-templates/analyst-with-translator.yaml b/workflows/coordinator-templates/analyst-with-translator.yaml
new file mode 100644
index 00000000..d4207cdc
--- /dev/null
+++ b/workflows/coordinator-templates/analyst-with-translator.yaml
@@ -0,0 +1,56 @@
+name: analyst-with-translator-coordinator
+description: >
+  Coordinator workflow that runs a data analysis then fans out translation
+  of the result into 3 target languages in parallel. Analyst performs the
+  analysis once; translator is delegated three times concurrently. Uses
+  the v0.32 multiagent preview (managed-agents-2026-04-01).
+version: "0.32.0-preview"
+risk_level: limited
+
+input_schema:
+  required: [dataset_url]
+  properties:
+    dataset_url:
+      type: string
+      description: URL of the dataset to analyze
+    target_languages:
+      type: array
+      description: Three target language codes (e.g. ["cs", "de", "fr"])
+
+steps:
+  - id: coordinator
+    type: managed-agent
+    managed_agent_config:
+      agent_id: auto
+      agent_template: analyst
+      message: >
+        Analyze {input.dataset_url}, then translate the executive summary
+        into each of {input.target_languages} (exactly 3 languages, run
+        the three translations concurrently).
+      model: claude-sonnet-4-6
+      timeout: 1200
+      tools_enabled:
+        - bash
+        - web_search
+        - multiagent_delegate
+        - multiagent_collect
+      multiagent:
+        type: coordinator
+        max_concurrent_threads: 4
+        prompt_routing_hint: >
+          Run analyst once for the data work. Then dispatch three
+          translator threads in parallel, one per language code.
+        roster:
+          - type: agent
+            id: analyst
+            nickname: analyst
+          - type: agent
+            id: translator
+            nickname: translator
+
+  - id: bundle
+    type: standard
+    depends_on: [coordinator]
+    prompt: >
+      Bundle {steps.coordinator.output} into a single JSON object keyed by
+      language code, with the source analysis at key 'source'.
diff --git a/workflows/coordinator-templates/code-review-and-test.yaml b/workflows/coordinator-templates/code-review-and-test.yaml
new file mode 100644
index 00000000..2d35abd9
--- /dev/null
+++ b/workflows/coordinator-templates/code-review-and-test.yaml
@@ -0,0 +1,57 @@
+name: code-review-and-test-coordinator
+description: >
+  Coordinator workflow that audits a pull request and produces pytest cases
+  in parallel. Reviewer scans the diff for issues; tester writes
+  regression tests covering the touched code paths. Uses the v0.32
+  multiagent preview (managed-agents-2026-04-01).
+version: "0.32.0-preview"
+risk_level: limited
+
+input_schema:
+  required: [pr_url]
+  properties:
+    pr_url:
+      type: string
+      description: URL of the pull request to review
+    repo_path:
+      type: string
+      description: Local checkout path of the repository
+
+steps:
+  - id: coordinator
+    type: managed-agent
+    managed_agent_config:
+      agent_id: auto
+      agent_template: reviewer
+      message: >
+        Review PR {input.pr_url} (checkout at {input.repo_path}).
+        Delegate the audit to reviewer and pytest authoring to tester.
+        Return a JSON object with keys 'review' and 'tests'.
+      model: claude-sonnet-4-6
+      timeout: 1200
+      tools_enabled:
+        - bash
+        - file_read
+        - file_write
+        - multiagent_delegate
+        - multiagent_collect
+      multiagent:
+        type: coordinator
+        max_concurrent_threads: 2
+        prompt_routing_hint: >
+          Send the diff to reviewer for an audit and to tester for
+          pytest-compatible regression tests. Run both in parallel.
+        roster:
+          - type: agent
+            id: reviewer
+            nickname: reviewer
+          - type: agent
+            id: tester
+            nickname: tester
+
+  - id: write_report
+    type: standard
+    depends_on: [coordinator]
+    prompt: >
+      Combine {steps.coordinator.output} into a markdown report with two
+      sections: "Review findings" and "New pytest cases".
diff --git a/workflows/coordinator-templates/research-and-write.yaml b/workflows/coordinator-templates/research-and-write.yaml
new file mode 100644
index 00000000..22bf14b9
--- /dev/null
+++ b/workflows/coordinator-templates/research-and-write.yaml
@@ -0,0 +1,55 @@
+name: research-and-write-coordinator
+description: >
+  Coordinator workflow that delegates research and writing to two managed
+  agents in parallel. Researcher gathers facts about the topic; writer
+  composes the final piece using those facts. Uses the v0.32 multiagent
+  preview (managed-agents-2026-04-01).
+version: "0.32.0-preview"
+risk_level: limited
+
+input_schema:
+  required: [topic]
+  properties:
+    topic:
+      type: string
+      description: Subject to research and write about
+    length:
+      type: string
+      description: Desired length of the output (short, medium, long)
+
+steps:
+  - id: coordinator
+    type: managed-agent
+    managed_agent_config:
+      agent_id: auto
+      agent_template: researcher
+      message: >
+        Topic: {input.topic}
+        Length: {input.length}
+        Delegate fact gathering to researcher and composition to writer.
+      model: claude-sonnet-4-6
+      timeout: 900
+      tools_enabled:
+        - web_search
+        - bash
+        - multiagent_delegate
+        - multiagent_collect
+      multiagent:
+        type: coordinator
+        max_concurrent_threads: 4
+        prompt_routing_hint: >
+          Use researcher for fact gathering, writer for prose composition.
+        roster:
+          - type: agent
+            id: researcher
+            nickname: researcher
+          - type: agent
+            id: writer
+            nickname: writer
+
+  - id: finalize
+    type: standard
+    depends_on: [coordinator]
+    prompt: >
+      Format the coordinator output {steps.coordinator.output} as a clean
+      markdown document with a title, summary, and citations section.