From e3095a75602f57213a7f0bff60b36f336bd68a91 Mon Sep 17 00:00:00 2001 From: donalddellapietra Date: Sun, 7 Jun 2026 20:45:51 -0400 Subject: [PATCH 1/7] fix(integrations): fail closed on redirect_to_safe in non-substituting adapters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A redirect_to_safe violation returns action="redirected" with allowed=True (intentional: LangGraph substitutes the safe tool and the agent flow continues). But every adapter except LangGraph gated tool execution on `if check.blocked`, which is False on a redirect — so the guard rolled the unsafe call out of the trace and then the adapter ran the original unsafe tool anyway. A safety control silently degraded to a no-op on agents/crewai/vercel_ai/claude_agent/google_adk/mcp. Add CheckResult.stop_original (blocked OR redirected) and gate the "run the original tool" decision on it in every adapter that cannot transparently substitute. Those adapters now fail *closed* (refuse the unsafe call) instead of fail-open. LangGraph is unchanged: it branches on .redirected first and performs the substitution. allowed/blocked/redirected semantics are unchanged, so the existing CheckResult-level tests still hold. Adds a regression test asserting stop_original is True on redirect and False on a clean pass. 
Co-Authored-By: Claude Opus 4.8 --- sponsio/integrations/agents.py | 12 ++++++++++-- sponsio/integrations/base.py | 21 +++++++++++++++++++++ sponsio/integrations/claude_agent.py | 6 +++++- sponsio/integrations/crewai.py | 8 ++++++-- sponsio/integrations/google_adk.py | 3 ++- sponsio/integrations/mcp.py | 12 ++++++++---- sponsio/integrations/vercel_ai.py | 4 +++- tests/test_redirect_to_safe.py | 28 ++++++++++++++++++++++++++++ 8 files changed, 83 insertions(+), 11 deletions(-) diff --git a/sponsio/integrations/agents.py b/sponsio/integrations/agents.py index 8d3ffc9..02fe563 100644 --- a/sponsio/integrations/agents.py +++ b/sponsio/integrations/agents.py @@ -130,7 +130,11 @@ def wrap_tool(self, tool: Any) -> Any: async def guarded_async(*args: Any, **kwargs: Any) -> Any: check = guard.guard_before(tool_name, kwargs) guard.last_check = check - if check.blocked: + # ``stop_original`` folds in ``redirected``: this adapter + # does not implement transparent tool substitution, so a + # ``redirect_to_safe`` redirect fails closed (refuse) + # rather than running the unsafe call. + if check.stop_original: msg = select_agent_message( check.det_violations, fallback="Contract violation" ) @@ -151,7 +155,11 @@ async def guarded_async(*args: Any, **kwargs: Any) -> Any: def guarded_sync(*args: Any, **kwargs: Any) -> Any: check = guard.guard_before(tool_name, kwargs) guard.last_check = check - if check.blocked: + # ``stop_original`` folds in ``redirected``: this adapter + # does not implement transparent tool substitution, so a + # ``redirect_to_safe`` redirect fails closed (refuse) + # rather than running the unsafe call. 
+ if check.stop_original: msg = select_agent_message( check.det_violations, fallback="Contract violation" ) diff --git a/sponsio/integrations/base.py b/sponsio/integrations/base.py index 258e2da..1268043 100644 --- a/sponsio/integrations/base.py +++ b/sponsio/integrations/base.py @@ -313,6 +313,27 @@ def redirected(self) -> bool: """True if any det violation returned a redirect outcome.""" return any(r.action == "redirected" for r in self.det_violations) + @property + def stop_original(self) -> bool: + """True when the adapter must NOT execute the original tool call. + + Folds hard blocks together with redirects. A ``redirect_to_safe`` + violation rolls the original ``unsafe`` call out of the trace and + sets ``redirected_to``; an adapter that runs the original call + anyway would execute the exact action the contract forbade — a + fail-*open* hole, the worst outcome for an enforcement layer. + + Adapters that implement transparent substitution (LangGraph) + MUST branch on ``redirected`` / ``redirected_to`` *first* and + invoke the safe tool. Adapters that don't (yet) support + substitution MUST gate execution on ``stop_original`` so a + redirect fails *closed* (the unsafe call is refused) instead of + falling through to ``if check.blocked``, which is False on a + redirect. ``escalated`` is intentionally excluded — see + ``guard_before`` for why escalation does not gate execution. 
+ """ + return self.blocked or self.redirected + @property def needs_retry(self) -> bool: """True if any sto violation returned a retry with feedback.""" diff --git a/sponsio/integrations/claude_agent.py b/sponsio/integrations/claude_agent.py index 0a64ec4..a2dd042 100644 --- a/sponsio/integrations/claude_agent.py +++ b/sponsio/integrations/claude_agent.py @@ -112,7 +112,11 @@ async def pre_tool_hook( check = guard.guard_before(tool_name, tool_input) guard.last_check = check - if check.blocked: + # ``stop_original`` folds in ``redirected``: this hook denies + # via the SDK permission system and has no substitution path, + # so a redirect fails closed (denied) rather than running the + # unsafe call. + if check.stop_original: # Prefer the structured ``agent_msg`` from OutcomeBuilder # — it's already phrased to steer the model toward # abandoning this action. Falls back to the legacy diff --git a/sponsio/integrations/crewai.py b/sponsio/integrations/crewai.py index db5c458..31efef3 100644 --- a/sponsio/integrations/crewai.py +++ b/sponsio/integrations/crewai.py @@ -102,7 +102,10 @@ def on_tool_start(self, context: Any) -> Any: ) self.last_check = check - if check.blocked: + # ``stop_original`` folds in ``redirected``: CrewAI's adapter has + # no transparent-substitution path, so a redirect fails closed + # (returns the rejection) rather than executing the unsafe tool. + if check.stop_original: msg = select_agent_message( check.det_violations, fallback="Contract violation detected" ) @@ -186,7 +189,8 @@ def make_guarded(orig: Any, name: str): def guarded(*args: Any, **kwargs: Any) -> Any: call_args = kwargs if kwargs else {"args": list(args)} check = guard.guard_before(name, call_args) - if check.blocked: + # Fail closed on redirect too (no substitution path here). 
+ if check.stop_original: msg = select_agent_message( check.det_violations, fallback="contract violated" ) diff --git a/sponsio/integrations/google_adk.py b/sponsio/integrations/google_adk.py index 300ada1..6f8a103 100644 --- a/sponsio/integrations/google_adk.py +++ b/sponsio/integrations/google_adk.py @@ -93,7 +93,8 @@ def wrap_tool(self, tool: Callable[..., Any]) -> Callable[..., Any]: async def guarded_async(*args: Any, **kwargs: Any) -> Any: check = guard.guard_before(tool_name, _call_args(tool, args, kwargs)) guard.last_check = check - if check.blocked: + # Fail closed on redirect too (no substitution path here). + if check.stop_original: return _blocked_result(check) result = await tool(*args, **kwargs) diff --git a/sponsio/integrations/mcp.py b/sponsio/integrations/mcp.py index fede799..2b1798b 100644 --- a/sponsio/integrations/mcp.py +++ b/sponsio/integrations/mcp.py @@ -142,13 +142,17 @@ async def call_tool(self, tool_name: str, arguments: dict | None = None) -> dict # that proxy this to an LLM (Claude Desktop, custom orchestrators) # can show the agent-tuned phrasing while keeping the legacy # ``violations`` array of log-formatted strings for back-compat. - blocked = [r for r in results if r.action == "blocked"] - if blocked: + # Treat ``redirected`` the same as ``blocked`` here: this proxy + # has no transparent-substitution path, so a ``redirect_to_safe`` + # redirect must refuse the unsafe call rather than fall through + # and execute it (a fail-open hole). 
+ stopped = [r for r in results if r.action in ("blocked", "redirected")] + if stopped: return { "error": "Blocked by behavioral contract", - "violations": [r.message for r in blocked], + "violations": [r.message for r in stopped], "agent_messages": [ - r.agent_msg for r in blocked if getattr(r, "agent_msg", "") + r.agent_msg for r in stopped if getattr(r, "agent_msg", "") ], } diff --git a/sponsio/integrations/vercel_ai.py b/sponsio/integrations/vercel_ai.py index 3ec36e7..299cde8 100644 --- a/sponsio/integrations/vercel_ai.py +++ b/sponsio/integrations/vercel_ai.py @@ -120,7 +120,9 @@ async def wrap_tool(self, call: Any, next_fn: Any) -> Any: check = guard.guard_before(tool_name, kwargs) guard.last_check = check - if check.blocked: + # ``stop_original`` folds in ``redirected``: no transparent + # substitution path here, so a redirect fails closed. + if check.stop_original: msg = select_agent_message( check.det_violations, fallback="Contract violation" ) diff --git a/tests/test_redirect_to_safe.py b/tests/test_redirect_to_safe.py index 0d39243..c642e55 100644 --- a/tests/test_redirect_to_safe.py +++ b/tests/test_redirect_to_safe.py @@ -96,6 +96,34 @@ def test_unconditional_redirect_fires_on_first_call(self) -> None: # ``allowed`` stays True so adapters know the agent flow can # continue (with the substituted tool, not the original). assert result.allowed is True + # ...but ``stop_original`` is True so an adapter that cannot + # substitute fails *closed* (refuses the unsafe call) instead of + # reading ``allowed``/``blocked`` and running the original. + assert result.stop_original is True + + def test_stop_original_fail_closed_contract(self) -> None: + """A redirect must never let the original ``unsafe`` call run. + + ``blocked`` is False on a redirect (by design), so any adapter + that gated execution on ``if check.blocked`` alone would + fail open. ``stop_original`` is the safe gate: True for both + blocks and redirects, False on a clean pass. 
+ """ + guard = Sponsio( + agent_id="bot", + contracts=[ + contract("redirect rm to trash").guarantees( + redirect_to_safe("rm_rf", "trash") + ) + ], + mode="enforce", + verbose=False, + ) + redirected = guard.guard_before("rm_rf", {"path": "/tmp/x"}) + assert redirected.stop_original is True + # A clean call does not stop. + clean = guard.guard_before("read_file", {"path": "/tmp/x"}) + assert clean.stop_original is False def test_other_tools_pass_through(self) -> None: guard = Sponsio( From 820e5e7e33b8ceaa606dc8ab11cc039e5de72a07 Mon Sep 17 00:00:00 2001 From: donalddellapietra Date: Sun, 7 Jun 2026 20:48:39 -0400 Subject: [PATCH 2/7] fix(ts/sdk): align Eq with Python value-equality for composite values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Python evaluator compares Eq operands with `left == right` (value equality); the TS evaluator used `l === r`, which compares arrays and objects by reference. The v0.2 Term abstraction makes value-equality reachable via `Eq(ArgValue(...), CtxValue(...))`, so a list- or object-valued arg that is equal-by-value (`[1,2] == [1,2]` → True in Python) compared False in TS on the same trace — a cross-language divergence on the deterministic core. Replace the `===` eq path with a `valuesEqual` deep structural comparison (element-wise for arrays, key-wise for objects, identity for primitives). le/lt/ge/gt are unchanged. Adds a parity regression test. 
Co-Authored-By: Claude Opus 4.8 --- ts/packages/sdk/src/__tests__/parity.test.ts | 52 ++++++++++++++++++++ ts/packages/sdk/src/core/evaluator.ts | 46 ++++++++++++++++- 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/ts/packages/sdk/src/__tests__/parity.test.ts b/ts/packages/sdk/src/__tests__/parity.test.ts index 8ef732f..c2bd69f 100644 --- a/ts/packages/sdk/src/__tests__/parity.test.ts +++ b/ts/packages/sdk/src/__tests__/parity.test.ts @@ -16,6 +16,7 @@ import { Atom, G, Implies, X, F, And, type Valuation, } from "../index.js"; +import { Eq, ArgValue, CtxValue, Const, predKey } from "../core/formula.js"; import { deadline, requiredStepsCompletion, @@ -348,12 +349,63 @@ function testDegeneratePatternRejection() { } } +// ───────────────────────────────────────────────────────────────────────── +// Eq value-equality parity (composite values) +// ───────────────────────────────────────────────────────────────────────── +// +// Python ``_safe_compare`` eq uses ``left == right``; the TS evaluator +// used ``l === r``, which compares arrays/objects by reference. With the +// Term abstraction, ``Eq(ArgValue(...), CtxValue(...))`` over list-valued +// args could pass in Python (``[1,2] == [1,2]`` is True) and fail in TS. +// ``valuesEqual`` now closes the gap with deep comparison. +function testEqValueEquality() { + console.log("[Eq value-equality parity]"); + + const key = (p: string, ...a: string[]) => predKey(p, ...a); + const eqArgCtx = new Eq( + new ArgValue("book", "seats"), + new CtxValue("expected_seats"), + ); + + // Equal-by-value arrays: Python True, TS now True (was False). + assert( + evaluate(eqArgCtx, [ + { + [key("arg_value", "book", "seats")]: [1, 2, 3], + [key("ctx_value", "expected_seats")]: [1, 2, 3], + }, + ] as unknown as Valuation[]) === true, + "Eq over equal arrays is True (deep equality)", + ); + + // Different arrays: False. 
+ assert( + evaluate(eqArgCtx, [ + { + [key("arg_value", "book", "seats")]: [1, 2, 3], + [key("ctx_value", "expected_seats")]: [1, 2], + }, + ] as unknown as Valuation[]) === false, + "Eq over unequal arrays is False", + ); + + // Scalar equality still works. + const eqScalar = new Eq(new ArgValue("pay", "amount"), new Const(50)); + assert( + evaluate(eqScalar, [ + { [key("arg_value", "pay", "amount")]: 50 }, + ] as unknown as Valuation[]) === true, + "Eq over equal scalars is True", + ); +} + console.log("=== TS↔Python Parity Regression Tests ===\n"); testBoundedEventuallyDeadline(); testRequiredStepsCompletion(); testGuardBeforeRollback(); testDeadlineNlParity(); testDegeneratePatternRejection(); +testEqValueEquality(); console.log(`\n${"=".repeat(40)}`); console.log(`Results: ${passed} passed, ${failed} failed`); diff --git a/ts/packages/sdk/src/core/evaluator.ts b/ts/packages/sdk/src/core/evaluator.ts index f00b492..fee756d 100644 --- a/ts/packages/sdk/src/core/evaluator.ts +++ b/ts/packages/sdk/src/core/evaluator.ts @@ -76,12 +76,54 @@ function resolveArith(expr: Term, state: Valuation): unknown { } } +/** + * Structural value-equality, matching Python's `==` for the value + * shapes that flow through grounding (numbers, strings, booleans, + * arrays, plain objects). + * + * The naive `l === r` diverged from the Python evaluator (`left == + * right`) for composite values: an `Eq(ArgValue(...), CtxValue(...))` + * over list- or object-valued args compares by *value* in Python + * (`[1] == [1]` is True) but `===` compares arrays/objects by + * *reference* in JS (`[1] === [1]` is False). With the v0.2 Term + * abstraction making value-equality reachable, that gap could pass a + * contract in Python and fail it in TS on the same trace. `valuesEqual` + * closes it with element-/key-wise deep comparison. 
+ */ +function valuesEqual(a: unknown, b: unknown): boolean { + if (a === b) return true; + if (a === null || b === null || a === undefined || b === undefined) { + return false; + } + if (Array.isArray(a) || Array.isArray(b)) { + if (!Array.isArray(a) || !Array.isArray(b) || a.length !== b.length) { + return false; + } + return a.every((x, i) => valuesEqual(x, b[i])); + } + if (typeof a === "object" && typeof b === "object") { + const ka = Object.keys(a as object); + const kb = Object.keys(b as object); + if (ka.length !== kb.length) return false; + return ka.every( + (k) => + Object.prototype.hasOwnProperty.call(b, k) && + valuesEqual( + (a as Record<string, unknown>)[k], + (b as Record<string, unknown>)[k], + ), + ); + } + return false; +} + /** * Compare two resolved values with the canonical "missing" semantics. * * If either operand is undefined / null, the comparison is False (the * comparison cannot decide). Same for type errors (mismatched types). - * This is the Hoare-vacuity convention. + * This is the Hoare-vacuity convention. `eq` uses `valuesEqual` for + * Python `==` parity on composite values. */ function safeCompare(op: string, left: unknown, right: unknown): boolean { if (left === undefined || left === null) return false; @@ -96,7 +138,7 @@ function safeCompare(op: string, left: unknown, right: unknown): boolean { case "lt": return l < r; case "ge": return l >= r; case "gt": return l > r; - case "eq": return l === r; + case "eq": return valuesEqual(l, r); } } catch { return false; From 46d438ddfa41a351191bd856c01b85b7d763bf4d Mon Sep 17 00:00:00 2001 From: donalddellapietra Date: Sun, 7 Jun 2026 20:49:46 -0400 Subject: [PATCH 3/7] fix(ts/sdk): guard createRequire for non-Node runtimes (Cloudflare Workers) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @sponsio/sdk built modules call `createRequire(import.meta.url)` at the top level of config-loader and pack-loader to lazily pull in the optional `yaml` package.
On Cloudflare Workers (and Deno Deploy) `import.meta.url` is `undefined`, so `createRequire(undefined)` throws at module-evaluation time and takes the entire Worker bundle down — even when the app never loads YAML config or packs. This is the crash the sponsio-demo repo worked around with a patch-package patch against the shipped dist (`createRequire(import.meta.url ?? "file:///sponsio-noop.js")`). Upstream the fix to source, but more elegantly than the dist patch: build the require lazily on first use (memoized getRequireCjs) instead of eagerly at import. A Worker bundle that never touches YAML now never calls createRequire at all; the `?? "file:///sponsio-noop.js"` fallback keeps it from throwing in the rare case it is reached on such a runtime. This lets the demo drop its patch-package patch once it bumps the SDK. Co-Authored-By: Claude Opus 4.8 --- ts/packages/sdk/src/core/config-loader.ts | 18 ++++++++++++++++-- ts/packages/sdk/src/core/pack-loader.ts | 16 ++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/ts/packages/sdk/src/core/config-loader.ts b/ts/packages/sdk/src/core/config-loader.ts index 29aadaa..4048c43 100644 --- a/ts/packages/sdk/src/core/config-loader.ts +++ b/ts/packages/sdk/src/core/config-loader.ts @@ -43,7 +43,21 @@ import { dirname, resolve as resolvePath } from "node:path"; // Use createRequire so we can lazily load the optional `yaml` package // without breaking non-yaml users (ESM dynamic import would force the // entire constructor path async, which we don't want). -const requireCjs = createRequire(import.meta.url); +// +// Built lazily on first use rather than at module load. On runtimes +// where the SDK is bundled but YAML config is never used (Cloudflare +// Workers, Deno Deploy), `import.meta.url` is `undefined` and an eager +// `createRequire(undefined)` throws at import time, taking the whole +// bundle down. 
Deferring the call means a Worker that never loads YAML +// never touches `createRequire`; the `?? noop` guard keeps it from +// throwing in the rare case it is reached on such a runtime. +let _requireCjs: ReturnType<typeof createRequire> | null = null; +function getRequireCjs(): ReturnType<typeof createRequire> { + if (_requireCjs === null) { + _requireCjs = createRequire(import.meta.url ?? "file:///sponsio-noop.js"); + } + return _requireCjs; +} export interface LoadedConfig { /** @@ -114,7 +128,7 @@ type YamlLib = { function loadYamlLib(): YamlLib { try { - return requireCjs("yaml") as YamlLib; + return getRequireCjs()("yaml") as YamlLib; } catch { throw new Error( "[sponsio] config loading requires the `yaml` package. " + diff --git a/ts/packages/sdk/src/core/pack-loader.ts b/ts/packages/sdk/src/core/pack-loader.ts index 58b1c88..b99f6d1 100644 --- a/ts/packages/sdk/src/core/pack-loader.ts +++ b/ts/packages/sdk/src/core/pack-loader.ts @@ -26,14 +26,26 @@ import { createRequire } from "node:module"; import { fileURLToPath } from "node:url"; import type { SkippedItem } from "./config-loader.js"; -const requireCjs = createRequire(import.meta.url); +// Lazily built on first use, with a fallback URL. On runtimes where +// `import.meta.url` is `undefined` (Cloudflare Workers, Deno Deploy), +// an eager `createRequire(undefined)` throws at module load and breaks +// the whole bundle even when YAML packs are never loaded. Deferring the +// call keeps Worker bundles that never touch packs from hitting it; the +// `?? noop` guard avoids the throw if it is reached. See config-loader. +let _requireCjs: ReturnType<typeof createRequire> | null = null; +function getRequireCjs(): ReturnType<typeof createRequire> { + if (_requireCjs === null) { + _requireCjs = createRequire(import.meta.url ??
"file:///sponsio-noop.js"); + } + return _requireCjs; +} interface YamlLib { parse: (src: string) => unknown; } function loadYamlLib(): YamlLib { - return requireCjs("yaml") as YamlLib; + return getRequireCjs()("yaml") as YamlLib; } /** From 477b819d87a6af9a62cb0d8252b1c14bd43d263d Mon Sep 17 00:00:00 2001 From: donalddellapietra Date: Sun, 7 Jun 2026 20:51:09 -0400 Subject: [PATCH 4/7] docs(core): document filter_tools cost, workflow_step batch caveat, and footguns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Doc/comment-only clarifications from the v0.2 review (no behavior change): * filter_tools: note it is O(candidates × trace_length) per call — rollback_last_event resets the verifier so each probe re-grounds the whole trace — and to call it once per turn + bound the trace with rotate_session. (Snapshot/restore fast path tracked as follow-up.) * workflow_step: document the end-of-trace weak-next vacuity — a trigger on the final event of a batch-verified trace incurs no violation (self-corrects in live enforce mode; matters only for verify/replay). * evaluator._warned_missing_vars: note it is process-global, lock-free, one-shot, and does not re-fire across sessions/tests. * Var: warn that ==/<=/< etc. build comparison AST nodes (not bools), so == does not value-compare two Var instances (hashing still works). * grounding arg_value: note raw arg values are retained for the whole trace; bound with rotate_session for large payloads. 
Co-Authored-By: Claude Opus 4.8 --- sponsio/formulas/evaluator.py | 8 ++++++++ sponsio/formulas/formula.py | 9 +++++++++ sponsio/integrations/base.py | 11 +++++++++++ sponsio/patterns/library.py | 11 +++++++++++ sponsio/tracer/grounding.py | 7 +++++++ 5 files changed, 46 insertions(+) diff --git a/sponsio/formulas/evaluator.py b/sponsio/formulas/evaluator.py index 8dca0df..ad579c1 100644 --- a/sponsio/formulas/evaluator.py +++ b/sponsio/formulas/evaluator.py @@ -72,6 +72,14 @@ } ) +# Process-global, append-only, lock-free. It backs "warn at most once +# per missing Var key for the lifetime of the process" — deliberately +# coarse: the warning is a developer aid, not session state, so a benign +# data race on ``add`` (which never loses the warning, only its exact +# ordering) is acceptable and not worth taking the monitor's RLock for. +# Consequence: warnings do not re-fire across sessions or test cases in +# the same interpreter. Tests that assert on the warning should reset +# this set in a fixture. _warned_missing_vars: set[str] = set() diff --git a/sponsio/formulas/formula.py b/sponsio/formulas/formula.py index eed790e..b9c7c0f 100644 --- a/sponsio/formulas/formula.py +++ b/sponsio/formulas/formula.py @@ -298,6 +298,15 @@ class Var(FormulaMixin, Term): Examples: ``Var("cost")``, ``Var("count", "tool")``. + Note: ``==`` / ``<`` / ``<=`` / ``>`` / ``>=`` are overloaded to + *build comparison AST nodes* (``Var("x") == 5`` returns + ``Eq(Var("x"), Const(5))``), SQLAlchemy-column style — they do NOT + return a bool. So ``Var("x") == Var("x")`` is a truthy ``Eq`` node, + not ``True``; don't rely on ``==`` to value-compare two ``Var`` + instances or to dedupe them in ordinary code. Hashing still works + (the frozen-dataclass ``__hash__`` is based on ``name``/``args``), + so ``Var`` is usable as a dict key / set member. + Attributes: name: Variable name. args: Optional positional arguments for parameterized variables. 
diff --git a/sponsio/integrations/base.py b/sponsio/integrations/base.py index 1268043..dedfe47 100644 --- a/sponsio/integrations/base.py +++ b/sponsio/integrations/base.py @@ -1374,6 +1374,17 @@ def filter_tools(self, candidates: list[str]) -> list[str]: via ``guard_before``. Treat ``filter_tools`` as a first-line defence, not a replacement for ``guard_before``. + Cost: O(len(candidates) × trace_length) per call. Each probe + appends a synthetic event and ``rollback_last_event`` resets the + verifier, so the *next* probe re-grounds the whole trace from + scratch rather than incrementally. That's fine for typical tool + menus and session lengths, but on a long-running agent with a + wide toolset it is the one spot that gives up the otherwise + incremental O(ΔN) grounding — call it once per turn, not per + candidate-per-turn, and lean on ``rotate_session`` to bound the + trace length. (A snapshot/restore fast path that avoids the full + re-ground is tracked as a follow-up.) + Args: candidates: Tool names the framework would normally expose to the agent for the next turn. diff --git a/sponsio/patterns/library.py b/sponsio/patterns/library.py index fe450d8..51a5a6a 100644 --- a/sponsio/patterns/library.py +++ b/sponsio/patterns/library.py @@ -299,6 +299,17 @@ def workflow_step( Returns: A ``DetFormula`` (NOT marked liveness — X is one-step bounded and the runtime can decide it after a single event, unlike F). + + Caveat (end-of-trace): under weak finite-trace semantics ``X`` is + vacuously true at the last position, so a ``trigger`` that fires on + the *final* event of a batch-verified trace incurs no violation + (there is no "next" event to inspect). In incremental enforce mode + this self-corrects — when the next event arrives, a non-matching + next action is blocked and rolled back, effectively forcing + ``next_action`` — but a whole-trace ``verify`` / replay will + silently pass a trailing trigger. 
Mirrors the ``rotate_session`` + liveness caveat; relevant only to post-hoc batch checks, not live + guarding. """ if not isinstance(trigger, Atom) or not isinstance(next_action, Atom): raise TypeError( diff --git a/sponsio/tracer/grounding.py b/sponsio/tracer/grounding.py index 6a6a779..7507671 100644 --- a/sponsio/tracer/grounding.py +++ b/sponsio/tracer/grounding.py @@ -467,6 +467,13 @@ def ground_event( # Emit unconditionally for every arg field on every tool_call — # Terms read by direct key lookup, not by content_atoms # extraction. + # + # Memory note: this stores the raw arg value (by reference) into + # the per-timestep valuation, which is retained for the whole + # trace. For tools with large or deeply-nested args that keeps + # those objects alive until ``reset`` / ``rotate_session``. Fine + # for typical scalar args; if a tool passes megabyte payloads, + # bound the trace with ``rotate_session``. if event.args: for _field, _val in event.args.items(): v[pred_key("arg_value", event.tool, _field)] = _val From 4aec07086a7ff780d463ea828820c271326b08fe Mon Sep 17 00:00:00 2001 From: donalddellapietra Date: Sun, 7 Jun 2026 20:52:12 -0400 Subject: [PATCH 5/7] docs: repair broken artifacts from the v0.2 em-dash sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v0.2 cosmetic em-dash→period pass mangled a few spots beyond the intended restyle. Fix only the genuinely broken ones (not a wholesale revert, which would clash with deliberate rewordings): * library.py: an orphaned comment line ("# . Sponsio doesn't hard-code") where the subject got stranded onto the previous line. * monitor.rotate_session docstring: the whole body was re-indented by one space and contained a broken ". lose visibility" fragment; restore 8-space indentation and reflow the sentences. * module docstring first-lines that read as typos in help()/IDEs: "Pattern library. the...", "RuntimeMonitor. intercepts...", "BaseGuard. unified..." 
→ proper em-dash openers. Co-Authored-By: Claude Opus 4.8 --- sponsio/integrations/base.py | 2 +- sponsio/patterns/library.py | 6 +-- sponsio/runtime/monitor.py | 84 ++++++++++++++++++------------------ 3 files changed, 46 insertions(+), 46 deletions(-) diff --git a/sponsio/integrations/base.py b/sponsio/integrations/base.py index dedfe47..7e7d2b4 100644 --- a/sponsio/integrations/base.py +++ b/sponsio/integrations/base.py @@ -1,4 +1,4 @@ -"""BaseGuard. unified parent class for all framework integrations. +"""BaseGuard — unified parent class for all framework integrations. Every framework adapter (LangGraph, MCP, CrewAI, etc.) inherits from BaseGuard. The base class owns all contract logic: diff --git a/sponsio/patterns/library.py b/sponsio/patterns/library.py index 51a5a6a..9858341 100644 --- a/sponsio/patterns/library.py +++ b/sponsio/patterns/library.py @@ -1,4 +1,4 @@ -"""Pattern library. the constraint primitive layer. +"""Pattern library — the constraint primitive layer. Patterns are the building blocks the rest of Sponsio compiles to: each function takes plain string args and returns a ``DetFormula`` (an LTL @@ -1951,8 +1951,8 @@ def delegation_depth_limit(max_depth: int, desc: str = "") -> DetFormula: # # Covers the runtime half of **ASI-03** (identity), **ASI-06** (memory # poisoning via content-source gating), and **ASI-07** (inter-agent -# comm via msg_verified gating). Users supply their own key convention -# . Sponsio doesn't hard-code "caller_id" vs "source" vs "msg_sender" +# comm via msg_verified gating). Users supply their own key convention; +# Sponsio doesn't hard-code "caller_id" vs "source" vs "msg_sender" # because each team has their own tagging scheme. # --------------------------------------------------------------------------- diff --git a/sponsio/runtime/monitor.py b/sponsio/runtime/monitor.py index 56c5032..a6c4c87 100644 --- a/sponsio/runtime/monitor.py +++ b/sponsio/runtime/monitor.py @@ -1,4 +1,4 @@ -"""RuntimeMonitor. 
intercepts agent actions and enforces det contracts. +"""RuntimeMonitor — intercepts agent actions and enforces det contracts. This is the central enforcement point. Every agent action flows through ``check_action()``, which runs the deterministic evaluation pipeline: @@ -377,47 +377,47 @@ def reset(self) -> None: def rotate_session(self) -> dict: """Begin a new session window; return a summary of what was flushed. - This is the **supported** way to bound memory in long-running - agents (24/7 service agents, always-on schedulers) without - losing contract enforcement. It behaves exactly like - :meth:`reset`. trace, log, spans, verifier cache, and atom - caches are all cleared; contracts on the underlying - :class:`~sponsio.models.system.System` are **not** touched. - The only difference is intent signalling and the return value: - callers get back the headline metrics of the window that just - closed so they can plumb them into audit logs / dashboards - before the numbers go away. - - Why not just keep using :meth:`reset`? - ``reset`` reads as "something went wrong, start over". - ``rotate_session`` is the name you want to see at a quarterly - review. "we rotate every 1000 turns to cap memory; here's the - hand-off record." - - Liveness caveat - --------------- - Formulas that span the **entire trace**. ``F(tool)`` / - ``always_followed_by(a, b)`` / whole-trace ``rate_limit(tool, N)`` - . lose visibility across the rotation boundary. Concretely: if - ``response`` was promised before ``rotate_session`` and still - hasn't happened, the post-rotation verifier won't see the - original ``trigger`` and can never fire the liveness violation. - To avoid silently eating obligations, this method refuses to - rotate while ``finish_session`` hasn't been called on a guard - with pending liveness obligations. but since ``RuntimeMonitor`` - doesn't know about guard-level ``finish_session``, the check - has to happen one layer up. 
See - :meth:`sponsio.integrations.base.BaseGuard.rotate_session` for - the guard-side handling: run ``finish_session`` first, then - rotate. - - Returns - ------- - dict - ``{"events": int, "turns": int, "log_entries": int, - "violations_cleared": 0}`` (``violations_cleared`` is always - 0 at the monitor layer. violations are tracked by - :class:`~sponsio.integrations.base.BaseGuard`, not here). + This is the **supported** way to bound memory in long-running + agents (24/7 service agents, always-on schedulers) without + losing contract enforcement. It behaves exactly like + :meth:`reset`: trace, log, spans, verifier cache, and atom + caches are all cleared; contracts on the underlying + :class:`~sponsio.models.system.System` are **not** touched. + The only difference is intent signalling and the return value: + callers get back the headline metrics of the window that just + closed so they can plumb them into audit logs / dashboards + before the numbers go away. + + Why not just keep using :meth:`reset`? + ``reset`` reads as "something went wrong, start over". + ``rotate_session`` is the name you want to see at a quarterly + review: "we rotate every 1000 turns to cap memory; here's the + hand-off record." + + Liveness caveat + --------------- + Formulas that span the **entire trace** — ``F(tool)`` / + ``always_followed_by(a, b)`` / whole-trace ``rate_limit(tool, N)`` + — lose visibility across the rotation boundary. Concretely: if + ``response`` was promised before ``rotate_session`` and still + hasn't happened, the post-rotation verifier won't see the + original ``trigger`` and can never fire the liveness violation. + To avoid silently eating obligations, this method refuses to + rotate while ``finish_session`` hasn't been called on a guard + with pending liveness obligations — but since ``RuntimeMonitor`` + doesn't know about guard-level ``finish_session``, the check + has to happen one layer up. 
See + :meth:`sponsio.integrations.base.BaseGuard.rotate_session` for + the guard-side handling: run ``finish_session`` first, then + rotate. + + Returns + ------- + dict + ``{"events": int, "turns": int, "log_entries": int, + "violations_cleared": 0}`` (``violations_cleared`` is always + 0 at the monitor layer — violations are tracked by + :class:`~sponsio.integrations.base.BaseGuard`, not here). """ with self._lock: summary = { From ea9670dd3c8f8502a73c31746da695d8eabb569c Mon Sep 17 00:00:00 2001 From: donalddellapietra Date: Sun, 7 Jun 2026 21:22:10 -0400 Subject: [PATCH 6/7] test: fix suite-wide setup errors and a false-positive sync failure Two pre-existing test-infra bugs that made a full `pytest tests/` run unusable (628 passed / 1684 errors); fixed independently of the v0.2 review changes. 1. conftest `_reset_rich_style_cache` (autouse) walks `gc.get_objects()` and calls `isinstance(obj, Style)` on every live object. CPython's isinstance consults the instance's `__class__` attribute, and once an openai-touching test has run, the SDK leaves lazy-import proxies in memory whose `__class__` getter imports `sounddevice` (voice helpers) or raises `OpenAIError`. That exception escaped the fixture and errored the *setup* of ~1680 unrelated tests. Wrap the isinstance probe in try/except so any object that objects to introspection is treated as "not a Style". 2. test_openclaw_artifact_sync compared the canonical plugin's npm build output (`plugins/sponsio-openclaw/dist/`, gitignored) against the committed bundled copy. On a fresh checkout that hasn't run `npm run build`, the source dist is absent and the test failed with "source missing". Skip when the build isn't present (CI builds first, so the check still fires there); a built tree that actually drifts still fails. Full suite now: 2296 passed, 26 skipped, 0 errors (random ordering). 
Co-Authored-By: Claude Opus 4.8 --- tests/conftest.py | 16 +++++++++++++++- tests/test_openclaw_artifact_sync.py | 16 ++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index e410e4a..aabdfb3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -76,7 +76,21 @@ def _reset_rich_style_cache(): if hasattr(Color, "parse") and hasattr(Color.parse, "cache_clear"): Color.parse.cache_clear() for obj in gc.get_objects(): - if isinstance(obj, Style): + # ``isinstance(obj, Style)`` reads ``obj.__class__`` (CPython + # consults the instance's ``__class__`` attribute, not just + # ``type(obj)``). Some live objects are lazy-import proxies + # whose ``__class__`` getter has side effects — e.g. once an + # openai-touching test has run, the openai SDK leaves proxies + # for its optional submodules in memory, and probing their + # ``__class__`` tries to import ``sounddevice`` (voice helpers) + # or raises ``OpenAIError``. That used to escape this autouse + # fixture and error the *setup* of every subsequent test. Treat + # any object that objects to introspection as "not a Style". + try: + is_style = isinstance(obj, Style) + except Exception: + continue + if is_style: try: obj._ansi = None except Exception: diff --git a/tests/test_openclaw_artifact_sync.py b/tests/test_openclaw_artifact_sync.py index eb3ff42..94314f6 100644 --- a/tests/test_openclaw_artifact_sync.py +++ b/tests/test_openclaw_artifact_sync.py @@ -23,6 +23,8 @@ from pathlib import Path +import pytest + REPO_ROOT = Path(__file__).resolve().parents[1] SRC_ROOT = REPO_ROOT / "plugins" / "sponsio-openclaw" DST_ROOT = REPO_ROOT / "sponsio" / "plugin" / "openclaw_artifact" @@ -38,6 +40,20 @@ def test_runtime_artifact_matches_canonical_plugin(): """Every file the sync script copies must be byte-identical.""" + # The canonical ``dist/`` is an npm build output and is gitignored. 
+ # A fresh source checkout that hasn't run ``npm run build`` has + # nothing to compare against, so this guard can't run — skip rather + # than fail with a spurious "source missing". CI builds the plugin + # before pytest, so the sync check still fires there; a *built* tree + # that has actually drifted (bundled copy missing, or bytes differ) + # still fails below. + if not (SRC_ROOT / "dist" / "index.js").exists(): + pytest.skip( + "canonical plugin not built — plugins/sponsio-openclaw/dist/ is " + "absent. Run `cd plugins/sponsio-openclaw && npm install && " + "npm run build` to enable this sync check." + ) + diffs: list[str] = [] missing: list[str] = [] for rel in _SYNCED_FILES: From 1c07712bdcae5b12e48db93d90727595c30360b4 Mon Sep 17 00:00:00 2001 From: donalddellapietra Date: Sun, 7 Jun 2026 21:42:13 -0400 Subject: [PATCH 7/7] test: use asyncio.run instead of deprecated get_event_loop().run_until_complete asyncio.get_event_loop() emits a DeprecationWarning ("There is no current event loop") when no loop is running, which it isn't in these synchronous tests. asyncio.run(coro) is the modern equivalent: it creates a fresh loop, runs the coroutine, and closes the loop. Clears the suite's last warning; full run is now 2296 passed / 26 skipped / 0 warnings. 
Co-Authored-By: Claude Opus 4.8 --- tests/test_claude_agent_integration.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/test_claude_agent_integration.py b/tests/test_claude_agent_integration.py index 5c978a6..2156526 100644 --- a/tests/test_claude_agent_integration.py +++ b/tests/test_claude_agent_integration.py @@ -56,9 +56,7 @@ def test_pre_tool_hook_blocks_violation(self): "agent_type": "test", } - result = asyncio.get_event_loop().run_until_complete( - pre_hook(input_data, "id_1", None) - ) + result = asyncio.run(pre_hook(input_data, "id_1", None)) assert result.get("hookSpecificOutput", {}).get("permissionDecision") == "deny" assert "Sponsio" in result.get("hookSpecificOutput", {}).get( @@ -85,9 +83,7 @@ def test_pre_tool_hook_allows_compliant_call(self): "agent_id": "test", "agent_type": "test", } - result1 = asyncio.get_event_loop().run_until_complete( - pre_hook(input_check, "id_1", None) - ) + result1 = asyncio.run(pre_hook(input_check, "id_1", None)) assert result1 == {} # Record that tool completed @@ -104,9 +100,7 @@ def test_pre_tool_hook_allows_compliant_call(self): "agent_id": "test", "agent_type": "test", } - result2 = asyncio.get_event_loop().run_until_complete( - pre_hook(input_refund, "id_2", None) - ) + result2 = asyncio.run(pre_hook(input_refund, "id_2", None)) assert result2 == {} def test_pre_tool_hook_rate_limit(self): @@ -129,16 +123,12 @@ def test_pre_tool_hook_rate_limit(self): } # First call — allowed - r1 = asyncio.get_event_loop().run_until_complete( - pre_hook(input_data, "id_1", None) - ) + r1 = asyncio.run(pre_hook(input_data, "id_1", None)) assert r1 == {} guard.guard_after("issue_refund", "done") # Second call — blocked - r2 = asyncio.get_event_loop().run_until_complete( - pre_hook(input_data, "id_2", None) - ) + r2 = asyncio.run(pre_hook(input_data, "id_2", None)) assert r2.get("hookSpecificOutput", {}).get("permissionDecision") == "deny" def test_last_check_updated(self): @@ 
-162,7 +152,7 @@ def test_last_check_updated(self): "agent_type": "test", } - asyncio.get_event_loop().run_until_complete(pre_hook(input_data, "id_1", None)) + asyncio.run(pre_hook(input_data, "id_1", None)) assert guard.last_check is not None assert guard.last_check.allowed @@ -198,9 +188,7 @@ def test_system_message_contains_tool_name(self): "agent_type": "test", } - result = asyncio.get_event_loop().run_until_complete( - pre_hook(input_data, "id_1", None) - ) + result = asyncio.run(pre_hook(input_data, "id_1", None)) msg = result.get("systemMessage", "") assert "issue_refund" in msg