diff --git a/.cargo/audit.toml b/.cargo/audit.toml index 7abf9bc..024a4d4 100644 --- a/.cargo/audit.toml +++ b/.cargo/audit.toml @@ -5,7 +5,8 @@ # RUSTSEC-2026-0098 and RUSTSEC-2026-0099 affect rustls-webpki 0.101.7. # This version is pulled in transitively by aws-smithy-http-client (via rustls 0.21.x), # which is part of the AWS SDK for Bedrock. The 0.101.x branch of rustls-webpki -# has no patched release; the fix only exists in >= 0.103.12. The AWS SDK has not yet +# has no patched release; fixes only exist in >= 0.103.12 / >= 0.103.13 depending on advisory. +# The AWS SDK has not yet # updated to use rustls 0.23.x (which would bring in rustls-webpki 0.103.12+). # Impact is limited: the vulnerability requires certificate misissuance and is only # reachable after signature verification. We already use rustls-webpki 0.103.12 for @@ -14,4 +15,8 @@ ignore = [ "RUSTSEC-2026-0098", "RUSTSEC-2026-0099", + "RUSTSEC-2026-0104", + # Transitive via wreq -> lru ^0.13; no patched 0.13.x release is available. + # Track upstream migration to wreq >= 6.x (or another backend) to remove this. + "RUSTSEC-2026-0002", ] diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dcc911..a28ecd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,34 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] +## [0.9.0] — 2026-04-22 + +### Added + +- **Shadow Judge completion oracle** — EdgeCrab can now run an opt-in secondary LLM verdict before accepting a run as complete. The judge is session-scoped, bounded by a per-session cap, and can veto likely premature stops by injecting a concrete continuation hint back into the loop instead of forcing the user to type `continue` manually. +- **`/shadow-judge` TUI command and picker** — New slash command with `on`, `off`, `toggle`, and `status` modes, plus an interactive picker when invoked without arguments. 
The status bar now surfaces whether the completion oracle is active and briefly shows intervention notices when it keeps a run going. +- **Structured task-status signaling for the harness** — New `report_task_status` tool and shared harness types let the model declare `in_progress`, `blocked`, or `completed` milestones with evidence and remaining steps, without letting the tool itself terminate the run. + +### Changed + +- **Completion gating is more robust after tool use** — repeated malformed tool retries are now suppressed semantically instead of only by exact payload fingerprint, and the conversation loop injects a corrective user nudge when it detects an argument-retry loop. +- **File and terminal tools now return better machine-readable execution metadata** — `write_file` reports line counts, `patch` reports before/after line totals, `terminal` reports truncation in its header, and `search_files` emits pagination summaries so the agent can paginate instead of re-running the same search blindly. +- **Tool schemas were tightened to reduce avoidable self-inflicted tool failures** — `write_file` no longer falsely requires `create_dirs`, several tools now spell out required arguments more clearly, and delegate/search/terminal/web parameter docs now better match runtime behavior. + +### Documentation + +- Added the Shadow Judge and harness hardening story to the release-facing README, changelog, and Astro docs so the new release is documented consistently across the CLI repo and site. 
+ +### Verification + +| Check | Result | +|-------|--------| +| `./scripts/release-version.sh check` | **passed locally before cut** | +| `cargo test -p edgecrab-core --lib` | **passed locally before cut** | +| `cargo test -p edgecrab-tools --lib` | **passed locally before cut** | +| `cargo test --workspace` | **passed locally before cut** | +| `fnm exec --using v22.12.0 pnpm build` in `site/` | **passed locally before cut** | + ## [0.8.0] — 2026-04-21 ### Added diff --git a/Cargo.lock b/Cargo.lock index 2aa3319..4446455 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -16,7 +16,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" dependencies = [ "cfg-if", "cipher", - "cpufeatures", + "cpufeatures 0.2.17", ] [[package]] @@ -42,24 +42,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "aligned" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee4508988c62edf04abd8d92897fca0c2995d907ce1dfeaf369dac3716a40685" -dependencies = [ - "as-slice", -] - -[[package]] -name = "aligned-vec" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" -dependencies = [ - "equator", -] - [[package]] name = "alloc-no-stdlib" version = "2.0.4" @@ -177,7 +159,7 @@ dependencies = [ "objc2-foundation", "parking_lot", "percent-encoding", - "windows-sys 0.59.0", + "windows-sys 0.60.2", "x11rb", ] @@ -190,32 +172,6 @@ dependencies = [ "rustversion", ] -[[package]] -name = "arg_enum_proc_macro" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "arrayvec" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" - -[[package]] -name = "as-slice" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516" -dependencies = [ - "stable_deref_trait", -] - [[package]] name = "async-compression" version = "0.4.41" @@ -262,7 +218,7 @@ dependencies = [ "eventsource-stream", "futures", "getrandom 0.3.4", - "rand 0.9.2", + "rand 0.9.4", "reqwest", "reqwest-eventsource", "secrecy", @@ -342,54 +298,11 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "av-scenechange" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f321d77c20e19b92c39e7471cf986812cbb46659d2af674adc4331ef3f18394" -dependencies = [ - "aligned", - "anyhow", - "arg_enum_proc_macro", - "arrayvec", - "log", - "num-rational", - "num-traits", - "pastey", - "rayon", - "thiserror 2.0.18", - "v_frame", - "y4m", -] - -[[package]] -name = "av1-grain" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cfddb07216410377231960af4fcab838eaa12e013417781b78bd95ee22077f8" -dependencies = [ - "anyhow", - "arrayvec", - "log", - "nom 8.0.0", - "num-rational", - "v_frame", -] - -[[package]] -name = "avif-serialize" -version = "0.8.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "375082f007bd67184fb9c0374614b29f9aaa604ec301635f72338bb65386a53d" -dependencies = [ - "arrayvec", -] - [[package]] name = "aws-config" -version = "1.8.15" +version = "1.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11493b0bad143270fb8ad284a096dd529ba91924c5409adeac856cc1bf047dbc" +checksum = "50f156acdd2cf55f5aa53ee416c4ac851cf1222694506c0b1f78c85695e9ca9d" dependencies = [ "aws-credential-types", 
"aws-runtime", @@ -429,9 +342,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.16.2" +version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" +checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" dependencies = [ "aws-lc-sys", "zeroize", @@ -439,9 +352,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.39.1" +version = "0.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399" +checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7" dependencies = [ "cc", "cmake", @@ -451,9 +364,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.7.2" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fc0651c57e384202e47153c1260b84a9936e19803d747615edf199dc3b98d17" +checksum = "5dcd93c82209ac7413532388067dce79be5a8780c1786e5fae3df22e4dee2864" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -477,9 +390,9 @@ dependencies = [ [[package]] name = "aws-sdk-bedrock" -version = "1.140.0" +version = "1.141.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "930086d6a615f77948c9244edff57d5ea4d7827302152652c24a16bf1430aff2" +checksum = "8a16484ce62b16cadf941e1c408d9b73afb9e6fc456573e5a9681e38d45fb00a" dependencies = [ "aws-credential-types", "aws-runtime", @@ -501,9 +414,9 @@ dependencies = [ [[package]] name = "aws-sdk-bedrockruntime" -version = "1.129.0" +version = "1.130.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c710f0b7dbd906047724ec892afc0de0b92c7484ba25f499a91563e0417a96d6" +checksum = "3e2f7bca252e3c5c8f0ed12c5501bf8b0fbadb937cd9fdd71a0ebd9d7526540f" dependencies = [ "aws-credential-types", "aws-runtime", @@ -528,9 +441,9 @@ dependencies = [ [[package]] name = 
"aws-sdk-sso" -version = "1.97.0" +version = "1.98.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aadc669e184501caaa6beafb28c6267fc1baef0810fb58f9b205485ca3f2567" +checksum = "d69c77aafa20460c68b6b3213c84f6423b6e76dbf89accd3e1789a686ffd9489" dependencies = [ "aws-credential-types", "aws-runtime", @@ -552,9 +465,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.99.0" +version = "1.100.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1342a7db8f358d3de0aed2007a0b54e875458e39848d54cc1d46700b2bfcb0a8" +checksum = "1c7e7b09346d5ca22a2a08267555843a6a0127fb20d8964cb6ecfb8fdb190225" dependencies = [ "aws-credential-types", "aws-runtime", @@ -576,9 +489,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.101.0" +version = "1.103.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab41ad64e4051ecabeea802d6a17845a91e83287e1dd249e6963ea1ba78c428a" +checksum = "c2249b81a2e73a8027c41c378463a81ec39b8510f184f2caab87de912af0f49b" dependencies = [ "aws-credential-types", "aws-runtime", @@ -601,9 +514,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.4.2" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0b660013a6683ab23797778e21f1f854744fdf05f68204b4cca4c8c04b5d1f4" +checksum = "68dc0b907359b120170613b5c09ccc61304eac3998ff6274b97d93ee6490115a" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -613,11 +526,11 @@ dependencies = [ "bytes", "form_urlencoded", "hex", - "hmac", + "hmac 0.13.0", "http 0.2.12", "http 1.4.0", "percent-encoding", - "sha2", + "sha2 0.11.0", "time", "tracing", ] @@ -683,11 +596,11 @@ dependencies = [ "hyper 0.14.32", "hyper 1.9.0", "hyper-rustls 0.24.2", - "hyper-rustls 0.27.7", + "hyper-rustls 0.27.9", "hyper-util", "pin-project-lite", "rustls 0.21.12", - "rustls 0.23.37", + "rustls 0.23.38", "rustls-native-certs", "rustls-pki-types", "tokio", 
@@ -726,9 +639,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.10.3" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "028999056d2d2fd58a697232f9eec4a643cf73a71cf327690a7edad1d2af2110" +checksum = "0504b1ab12debb5959e5165ee5fe97dd387e7aa7ea6a477bfd7635dfe769a4f5" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -751,11 +664,12 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.11.6" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "876ab3c9c29791ba4ba02b780a3049e21ec63dabda09268b175272c3733a79e6" +checksum = "b71a13df6ada0aafbf21a73bdfcdf9324cfa9df77d96b8446045be3cde61b42e" dependencies = [ "aws-smithy-async", + "aws-smithy-runtime-api-macros", "aws-smithy-types", "bytes", "http 0.2.12", @@ -766,6 +680,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-smithy-runtime-api-macros" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d7396fd9500589e62e460e987ecb671bad374934e55ec3b5f498cc7a8a8a7b7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "aws-smithy-types" version = "1.4.7" @@ -803,9 +728,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.14" +version = "1.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47c8323699dd9b3c8d5b3c13051ae9cdef58fd179957c882f8374dd8725962d9" +checksum = "2f4bbcaa9304ea40902d3d5f42a0428d1bd895a2b0f6999436fb279ffddc58ac" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -880,7 +805,7 @@ dependencies = [ "getrandom 0.2.17", "instant", "pin-project-lite", - "rand 0.8.5", + "rand 0.8.6", "tokio", ] @@ -912,7 +837,7 @@ version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags 2.11.0", + 
"bitflags 2.11.1", "cexpr", "clang-sys", "itertools 0.13.0", @@ -954,12 +879,6 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" -[[package]] -name = "bit_field" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" - [[package]] name = "bitflags" version = "1.3.2" @@ -968,26 +887,26 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.11.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" [[package]] -name = "bitstream-io" -version = "4.9.0" +name = "block-buffer" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60d4bd9d1db2c6bdf285e223a7fa369d5ce98ec767dec949c6ca62863ce61757" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ - "core2", + "generic-array", ] [[package]] name = "block-buffer" -version = "0.10.4" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" dependencies = [ - "generic-array", + "hybrid-array", ] [[package]] @@ -1016,12 +935,12 @@ dependencies = [ "http-body-util", "hyper 1.9.0", "hyper-named-pipe", - "hyper-rustls 0.27.7", + "hyper-rustls 0.27.9", "hyper-util", "hyperlocal", "log", "pin-project-lite", - "rustls 0.23.37", + "rustls 0.23.38", "rustls-native-certs", "rustls-pemfile", "rustls-pki-types", @@ -1068,7 +987,7 @@ version = "4.15.15" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e4470e96bd94533c2f88c08be95a8e6d2d37a3b497a773b0a46273a376978f00" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "boring-sys2", "brotli", "flate2", @@ -1110,24 +1029,12 @@ dependencies = [ "serde", ] -[[package]] -name = "built" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4ad8f11f288f48ca24471bbd51ac257aaeaaa07adae295591266b792902ae64" - [[package]] name = "bumpalo" version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" -[[package]] -name = "bytecount" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" - [[package]] name = "bytemuck" version = "1.25.0" @@ -1182,9 +1089,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.59" +version = "1.2.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283" +checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20" dependencies = [ "find-msvc-tools", "jobserver", @@ -1249,7 +1156,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" dependencies = [ - "crypto-common", + "crypto-common 0.1.7", "inout", ] @@ -1266,9 +1173,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.6.0" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" dependencies = [ "clap_builder", "clap_derive", @@ -1288,9 +1195,9 @@ dependencies = [ [[package]] name = "clap_derive" -version 
= "4.6.0" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" dependencies = [ "heck", "proc-macro2", @@ -1323,10 +1230,10 @@ dependencies = [ ] [[package]] -name = "color_quant" -version = "1.1.0" +name = "cmov" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" +checksum = "3f88a43d011fc4a6876cb7344703e297c71dda42494fee094d5f7c76bf13f746" [[package]] name = "colorchoice" @@ -1378,6 +1285,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "const-oid" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + [[package]] name = "const-random" version = "0.1.18" @@ -1433,19 +1346,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] -name = "core2" -version = "0.4.0" +name = "cpufeatures" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ - "memchr", + "libc", ] [[package]] name = "cpufeatures" -version = "0.2.17" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" dependencies = [ "libc", ] @@ -1470,25 +1383,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "crossbeam-deque" -version = "0.8.6" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" -dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" -dependencies = [ - "crossbeam-utils", -] - [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -1501,7 +1395,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "crossterm_winapi", "derive_more", "document-features", @@ -1538,6 +1432,15 @@ dependencies = [ "typenum", ] +[[package]] +name = "crypto-common" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710" +dependencies = [ + "hybrid-array", +] + [[package]] name = "csscolorparser" version = "0.6.2" @@ -1564,6 +1467,15 @@ version = "0.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7ab264ea985f1bd27887d7b21ea2bb046728e05d11909ca138d700c494730db" +[[package]] +name = "ctutils" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5515a3834141de9eafb9717ad39eea8247b5674e6066c404e8c4b365d2a29e" +dependencies = [ + "cmov", +] + [[package]] name = "darling" version = "0.20.11" @@ -1751,11 +1663,23 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer", - "crypto-common", + "block-buffer 0.10.4", + "crypto-common 0.1.7", "subtle", ] +[[package]] +name = "digest" +version = "0.11.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c" +dependencies = [ + "block-buffer 0.12.0", + "const-oid", + "crypto-common 0.2.1", + "ctutils", +] + [[package]] name = "dirs" version = "5.0.1" @@ -1804,7 +1728,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "objc2", ] @@ -1884,7 +1808,7 @@ dependencies = [ [[package]] name = "edgecrab-acp" -version = "0.8.0" +version = "0.9.0" dependencies = [ "anyhow", "async-trait", @@ -1904,7 +1828,7 @@ dependencies = [ [[package]] name = "edgecrab-cli" -version = "0.8.0" +version = "0.9.0" dependencies = [ "anyhow", "arboard", @@ -1930,14 +1854,14 @@ dependencies = [ "edgequake-llm", "flate2", "libc", - "rand 0.9.2", + "rand 0.9.4", "ratatui", "reqwest", "serde", "serde_json", "serde_norway", "serial_test", - "sha2", + "sha2 0.10.9", "shell-words", "shellexpand", "similar", @@ -1957,11 +1881,11 @@ dependencies = [ [[package]] name = "edgecrab-command-catalog" -version = "0.8.0" +version = "0.9.0" [[package]] name = "edgecrab-core" -version = "0.8.0" +version = "0.9.0" dependencies = [ "anyhow", "async-trait", @@ -1994,7 +1918,7 @@ dependencies = [ [[package]] name = "edgecrab-cron" -version = "0.8.0" +version = "0.9.0" dependencies = [ "anyhow", "chrono", @@ -2013,7 +1937,7 @@ dependencies = [ [[package]] name = "edgecrab-gateway" -version = "0.8.0" +version = "0.9.0" dependencies = [ "aes", "anyhow", @@ -2034,10 +1958,10 @@ dependencies = [ "edgecrab-types", "edgequake-llm", "futures", - "hmac", + "hmac 0.12.1", "lettre", "md5", - "rand 0.9.2", + "rand 0.9.4", "regex", "reqwest", "serde", @@ -2045,7 +1969,7 @@ dependencies = [ "serde_norway", "serial_test", "sha1", - "sha2", + "sha2 0.10.9", "subtle", "tempfile", "tokio", @@ -2060,7 +1984,7 @@ dependencies = [ 
[[package]] name = "edgecrab-lsp" -version = "0.8.0" +version = "0.9.0" dependencies = [ "anyhow", "async-lsp", @@ -2089,7 +2013,7 @@ dependencies = [ [[package]] name = "edgecrab-migrate" -version = "0.8.0" +version = "0.9.0" dependencies = [ "anyhow", "chrono", @@ -2122,7 +2046,7 @@ dependencies = [ [[package]] name = "edgecrab-plugins" -version = "0.8.0" +version = "0.9.0" dependencies = [ "anyhow", "async-trait", @@ -2138,7 +2062,7 @@ dependencies = [ "serde", "serde_json", "serde_norway", - "sha2", + "sha2 0.10.9", "tempfile", "thiserror 2.0.18", "tokio", @@ -2167,7 +2091,7 @@ dependencies = [ [[package]] name = "edgecrab-sdk" -version = "0.8.0" +version = "0.9.0" dependencies = [ "async-trait", "edgecrab-sdk-core", @@ -2180,7 +2104,7 @@ dependencies = [ [[package]] name = "edgecrab-sdk-core" -version = "0.8.0" +version = "0.9.0" dependencies = [ "async-trait", "edgecrab-core", @@ -2199,7 +2123,7 @@ dependencies = [ [[package]] name = "edgecrab-sdk-macros" -version = "0.8.0" +version = "0.9.0" dependencies = [ "proc-macro2", "quote", @@ -2208,7 +2132,7 @@ dependencies = [ [[package]] name = "edgecrab-security" -version = "0.8.0" +version = "0.9.0" dependencies = [ "aho-corasick", "edgecrab-types", @@ -2229,12 +2153,12 @@ dependencies = [ [[package]] name = "edgecrab-state" -version = "0.8.0" +version = "0.9.0" dependencies = [ "chrono", "dirs 5.0.1", "edgecrab-types", - "rand 0.9.2", + "rand 0.9.4", "rusqlite", "serde", "serde_json", @@ -2246,7 +2170,7 @@ dependencies = [ [[package]] name = "edgecrab-tools" -version = "0.8.0" +version = "0.9.0" dependencies = [ "async-trait", "base64 0.22.1", @@ -2259,7 +2183,6 @@ dependencies = [ "edgecrab-security", "edgecrab-state", "edgecrab-types", - "edgeparse-core", "edgequake-llm", "futures", "image", @@ -2273,7 +2196,7 @@ dependencies = [ "serde", "serde_json", "serde_norway", - "sha2", + "sha2 0.10.9", "shell-words", "shellexpand", "strip-ansi-escapes", @@ -2291,7 +2214,7 @@ dependencies = [ [[package]] name = 
"edgecrab-types" -version = "0.8.0" +version = "0.9.0" dependencies = [ "chrono", "edgequake-llm", @@ -2304,30 +2227,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "edgeparse-core" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "908549dc318f24a5067dad05ae009cfea49283f0a5d6ebb0c2bc09e742f37184" -dependencies = [ - "anyhow", - "base64 0.22.1", - "euclid", - "image", - "indexmap 2.13.1", - "log", - "ordered-float", - "pdf-cos", - "rayon", - "regex", - "serde", - "serde_json", - "thiserror 2.0.18", - "ttf-parser", - "unicode-normalization", - "zip", -] - [[package]] name = "edgequake-llm" version = "0.6.12" @@ -2393,41 +2292,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" -[[package]] -name = "encoding_rs" -version = "0.8.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" -dependencies = [ - "cfg-if", -] - [[package]] name = "env_home" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe" -[[package]] -name = "equator" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" -dependencies = [ - "equator-macro", -] - -[[package]] -name = "equator-macro" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -2470,21 +2340,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "exr" -version = "1.74.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4300e043a56aa2cb633c01af81ca8f699a321879a7854d3896a0ba89056363be" -dependencies = [ - "bit_field", - "half", - "lebe", - "miniz_oxide", - "rayon-core", - "smallvec", - "zune-inflate", -] - [[package]] name = "fallible-iterator" version = "0.3.0" @@ -2845,16 +2700,6 @@ dependencies = [ "wasip3", ] -[[package]] -name = "gif" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5df2ba84018d80c213569363bdcd0c64e6933c67fe4c1d60ecf822971a3c35e" -dependencies = [ - "color_quant", - "weezl", -] - [[package]] name = "glob" version = "0.3.3" @@ -2873,7 +2718,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.13.1", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -2892,7 +2737,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.4.0", - "indexmap 2.13.1", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -2945,6 +2790,12 @@ dependencies = [ "foldhash 0.2.0", ] +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + [[package]] name = "hashlink" version = "0.9.1" @@ -2972,7 +2823,16 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ - "digest", + "digest 0.10.7", +] + +[[package]] +name = "hmac" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f" +dependencies = [ + "digest 0.11.2", ] [[package]] @@ -3062,7 +2922,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.4.0", - "indexmap 2.13.1", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -3080,6 +2940,15 @@ version = "1.0.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "hybrid-array" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3944cf8cf766b40e2a1a333ee5e9b563f854d5fa49d6a8ca2764e97c6eddb214" +dependencies = [ + "typenum", +] + [[package]] name = "hyper" version = "0.14.32" @@ -3158,20 +3027,19 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.7" +version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ "http 1.4.0", "hyper 1.9.0", "hyper-util", - "rustls 0.23.37", + "rustls 0.23.38", "rustls-native-certs", - "rustls-pki-types", "tokio", "tokio-rustls 0.26.4", "tower-service", - "webpki-roots 1.0.6", + "webpki-roots 1.0.7", ] [[package]] @@ -3379,38 +3247,14 @@ checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104" dependencies = [ "bytemuck", "byteorder-lite", - "color_quant", - "exr", - "gif", - "image-webp", "moxcms", "num-traits", "png", - "qoi", - "ravif", - "rayon", - "rgb", "tiff", "zune-core", "zune-jpeg", ] -[[package]] -name = "image-webp" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" -dependencies = [ - "byteorder-lite", - "quick-error 2.0.1", -] - -[[package]] -name = "imgref" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8" - [[package]] name = "indexmap" version = "1.9.3" @@ -3424,12 +3268,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.1" +version = "2.14.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "serde", "serde_core", ] @@ -3487,17 +3331,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "interpolate_name" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "inventory" version = "0.3.24" @@ -3553,47 +3386,6 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" -[[package]] -name = "jiff" -version = "0.2.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" -dependencies = [ - "jiff-static", - "jiff-tzdb-platform", - "log", - "portable-atomic", - "portable-atomic-util", - "serde_core", - "windows-sys 0.61.2", -] - -[[package]] -name = "jiff-static" -version = "0.2.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "jiff-tzdb" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" - -[[package]] -name = "jiff-tzdb-platform" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" -dependencies = [ - "jiff-tzdb", -] - [[package]] name = "jobserver" version = "0.1.34" @@ -3606,9 
+3398,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.94" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" +checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" dependencies = [ "cfg-if", "futures-util", @@ -3645,12 +3437,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" -[[package]] -name = "lebe" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" - [[package]] name = "lettre" version = "0.11.21" @@ -3669,28 +3455,18 @@ dependencies = [ "nom 8.0.0", "percent-encoding", "quoted_printable", - "rustls 0.23.37", + "rustls 0.23.38", "socket2 0.6.3", "tokio", "url", - "webpki-roots 1.0.6", + "webpki-roots 1.0.7", ] [[package]] -name = "libc" -version = "0.2.184" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" - -[[package]] -name = "libfuzzer-sys" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f12a681b7dd8ce12bff52488013ba614b869148d54dd79836ab85aafdd53f08d" -dependencies = [ - "arbitrary", - "cc", -] +name = "libc" +version = "0.2.185" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f" [[package]] name = "libloading" @@ -3714,11 +3490,11 @@ dependencies = [ [[package]] name = "libredox" -version = "0.1.15" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" +checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" 
dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "libc", "plain", "redox_syscall 0.7.4", @@ -3741,7 +3517,7 @@ version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f50e8f47623268b5407192d26876c4d7f89d686ca130fdc53bced4814cd29f8" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", ] [[package]] @@ -3792,15 +3568,6 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" -[[package]] -name = "loop9" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" -dependencies = [ - "imgref", -] - [[package]] name = "lru" version = "0.13.0" @@ -3809,9 +3576,9 @@ checksum = "227748d55f2f0ab4735d87fd623798cb6b664512fe979705f829c9f81c934465" [[package]] name = "lru" -version = "0.16.3" +version = "0.16.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39" dependencies = [ "hashbrown 0.16.1", ] @@ -3873,26 +3640,6 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" -[[package]] -name = "maybe-rayon" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" -dependencies = [ - "cfg-if", - "rayon", -] - -[[package]] -name = "md-5" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" -dependencies = [ - "cfg-if", - "digest", -] - [[package]] name = "md5" version = "0.7.0" @@ -4006,7 +3753,7 @@ version = "3.8.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fa73b028610e2b26e9e40bd2c8ff8a98e6d7ed5d67d89ebf4bfd2f992616b024" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "ctor", "futures", "napi-build", @@ -4060,19 +3807,13 @@ dependencies = [ "libloading 0.9.0", ] -[[package]] -name = "new_debug_unreachable" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" - [[package]] name = "nix" version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "cfg-if", "cfg_aliases 0.1.1", "libc", @@ -4084,7 +3825,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "cfg-if", "cfg_aliases 0.2.1", "libc", @@ -4125,17 +3866,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "nom_locate" -version = "5.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d" -dependencies = [ - "bytecount", - "memchr", - "nom 8.0.0", -] - [[package]] name = "non-zero-byte-slice" version = "0.1.0" @@ -4145,12 +3875,6 @@ dependencies = [ "serde", ] -[[package]] -name = "noop_proc_macro" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" - [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -4160,16 +3884,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "num-bigint" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" -dependencies = [ - "num-integer", - "num-traits", -] - [[package]] name = "num-conv" version = "0.2.1" @@ -4196,17 +3910,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -4240,7 +3943,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d49e936b501e5c5bf01fda3a9452ff86dc3ea98ad5f283e1455153142d97518c" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "objc2", "objc2-core-graphics", "objc2-foundation", @@ -4252,7 +3955,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "dispatch2", "objc2", ] @@ -4263,7 +3966,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e022c9d066895efa1345f8e33e584b9f958da2fd4cd116792e15e07e4720a807" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "dispatch2", "objc2", "objc2-core-foundation", @@ -4282,7 +3985,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "objc2", "objc2-core-foundation", ] @@ -4293,7 +3996,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "180788110936d59bab6bd83b6060ffdfffb3b922ba1396b312ae795e1de9d81d" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "objc2", "objc2-core-foundation", ] @@ -4400,7 +4103,7 @@ dependencies = [ 
"glob", "opentelemetry", "percent-encoding", - "rand 0.8.5", + "rand 0.8.6", "thiserror 1.0.69", ] @@ -4417,8 +4120,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" dependencies = [ "num-traits", - "rand 0.8.5", - "serde", ] [[package]] @@ -4450,50 +4151,6 @@ dependencies = [ "windows-link 0.2.1", ] -[[package]] -name = "paste" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" - -[[package]] -name = "pastey" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" - -[[package]] -name = "pdf-cos" -version = "0.39.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eb7282100321705650cf9b49f730ef83811f8f8834c1da810470db70780771b" -dependencies = [ - "aes", - "bitflags 2.11.0", - "cbc", - "chrono", - "ecb", - "encoding_rs", - "flate2", - "getrandom 0.3.4", - "indexmap 2.13.1", - "itoa", - "jiff", - "log", - "md-5", - "nom 8.0.0", - "nom_locate", - "rand 0.9.2", - "rangemap", - "rayon", - "sha2", - "stringprep", - "thiserror 2.0.18", - "time", - "ttf-parser", - "weezl", -] - [[package]] name = "percent-encoding" version = "2.3.2" @@ -4540,7 +4197,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220" dependencies = [ "pest", - "sha2", + "sha2 0.10.9", ] [[package]] @@ -4579,7 +4236,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared 0.11.3", - "rand 0.8.5", + "rand 0.8.6", ] [[package]] @@ -4627,9 +4284,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = 
"pkg-config" -version = "0.3.32" +version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" [[package]] name = "plain" @@ -4643,7 +4300,7 @@ version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "crc32fast", "fdeflate", "flate2", @@ -4656,15 +4313,6 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" -[[package]] -name = "portable-atomic-util" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" -dependencies = [ - "portable-atomic", -] - [[package]] name = "portable-pty" version = "0.9.0" @@ -4755,25 +4403,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "profiling" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eb8486b569e12e2c32ad3e204dbaba5e4b5b216e9367044f25f1dba42341773" -dependencies = [ - "profiling-procmacros", -] - -[[package]] -name = "profiling-procmacros" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52717f9a02b6965224f95ca2a81e2e0c5c43baacd28ca057577988930b6c3d5b" -dependencies = [ - "quote", - "syn 2.0.117", -] - [[package]] name = "proptest" version = "1.11.0" @@ -4782,9 +4411,9 @@ checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" dependencies = [ "bit-set 0.8.0", "bit-vec 0.8.0", - "bitflags 2.11.0", + "bitflags 2.11.1", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "rand_chacha 0.9.0", "rand_xorshift", "regex-syntax", @@ -4795,9 +4424,9 @@ 
dependencies = [ [[package]] name = "pxfm" -version = "0.1.28" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d" +checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f" [[package]] name = "pyo3" @@ -4862,15 +4491,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "qoi" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001" -dependencies = [ - "bytemuck", -] - [[package]] name = "quick-error" version = "1.2.3" @@ -4895,7 +4515,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash 2.1.2", - "rustls 0.23.37", + "rustls 0.23.38", "socket2 0.6.3", "thiserror 2.0.18", "tokio", @@ -4912,10 +4532,10 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash 2.1.2", - "rustls 0.23.37", + "rustls 0.23.38", "rustls-pki-types", "slab", "thiserror 2.0.18", @@ -4935,7 +4555,7 @@ dependencies = [ "once_cell", "socket2 0.6.3", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -4967,21 +4587,20 @@ checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "rand" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" dependencies = [ "libc", "rand_chacha 0.3.1", "rand_core 0.6.4", - "serde", ] [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 
0.9.0", "rand_core 0.9.5", @@ -5014,7 +4633,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom 0.2.17", - "serde", ] [[package]] @@ -5035,12 +4653,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "rangemap" -version = "1.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" - [[package]] name = "ratatui" version = "0.30.0" @@ -5061,13 +4673,13 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ef8dea09a92caaf73bff7adb70b76162e5937524058a7e5bff37869cbbec293" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "compact_str", "hashbrown 0.16.1", "indoc", "itertools 0.14.0", "kasuari", - "lru 0.16.3", + "lru 0.16.4", "strum", "thiserror 2.0.18", "unicode-segmentation", @@ -5113,7 +4725,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7dbfa023cd4e604c2553483820c5fe8aa9d71a42eea5aa77c6e7f35756612db" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "hashbrown 0.16.1", "indoc", "instability", @@ -5126,83 +4738,13 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "rav1e" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b6dd56e85d9483277cde964fd1bdb0428de4fec5ebba7540995639a21cb32b" -dependencies = [ - "aligned-vec", - "arbitrary", - "arg_enum_proc_macro", - "arrayvec", - "av-scenechange", - "av1-grain", - "bitstream-io", - "built", - "cfg-if", - "interpolate_name", - "itertools 0.14.0", - "libc", - "libfuzzer-sys", - "log", - "maybe-rayon", - "new_debug_unreachable", - "noop_proc_macro", - "num-derive", - "num-traits", - "paste", - "profiling", - "rand 0.9.2", - "rand_chacha 0.9.0", - "simd_helpers", - "thiserror 2.0.18", - "v_frame", - "wasm-bindgen", -] - 
-[[package]] -name = "ravif" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e52310197d971b0f5be7fe6b57530dcd27beb35c1b013f29d66c1ad73fbbcc45" -dependencies = [ - "avif-serialize", - "imgref", - "loop9", - "quick-error 2.0.1", - "rav1e", - "rayon", - "rgb", -] - -[[package]] -name = "rayon" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" -dependencies = [ - "crossbeam-deque", - "crossbeam-utils", -] - [[package]] name = "redox_syscall" version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", ] [[package]] @@ -5211,7 +4753,7 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", ] [[package]] @@ -5305,7 +4847,7 @@ dependencies = [ "http-body 1.0.1", "http-body-util", "hyper 1.9.0", - "hyper-rustls 0.27.7", + "hyper-rustls 0.27.9", "hyper-util", "js-sys", "log", @@ -5313,7 +4855,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.37", + "rustls 0.23.38", "rustls-native-certs", "rustls-pki-types", "serde", @@ -5331,7 +4873,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 1.0.6", + "webpki-roots 1.0.7", ] [[package]] @@ -5350,12 +4892,6 @@ dependencies = [ "thiserror 1.0.69", ] -[[package]] -name = "rgb" -version = "0.8.53" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4" - [[package]] name = "rhai" version = "1.24.0" @@ -5363,7 +4899,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f9ef5dabe4c0b43d8f1187dc6beb67b53fe607fff7e30c5eb7f71b814b8c2c1" dependencies = [ "ahash", - "bitflags 2.11.0", + "bitflags 2.11.1", "no-std-compat", "num-traits", "once_cell", @@ -5406,7 +4942,7 @@ version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "fallible-iterator", "fallible-streaming-iterator", "hashlink", @@ -5451,7 +4987,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "errno", "libc", "linux-raw-sys", @@ -5472,16 +5008,16 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.37" +version = "0.23.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +checksum = "69f9466fb2c14ea04357e91413efb882e2a6d4a406e625449bc0a5d360d53a21" dependencies = [ "aws-lc-rs", "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.12", + "rustls-webpki 0.103.13", "subtle", "zeroize", ] @@ -5529,9 +5065,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.12" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "aws-lc-rs", "ring", @@ -5643,7 +5179,7 @@ version = "3.7.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "core-foundation", "core-foundation-sys", "libc", @@ -5712,7 +5248,6 @@ version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ - "indexmap 2.13.1", "itoa", "memchr", "serde", @@ -5726,7 +5261,7 @@ version = "0.9.42" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e408f29489b5fd500fab51ff1484fc859bb655f32c671f307dcd733b72e8168c" dependencies = [ - "indexmap 2.13.1", + "indexmap 2.14.0", "itoa", "ryu", "serde", @@ -5786,7 +5321,7 @@ dependencies = [ "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.13.1", + "indexmap 2.14.0", "schemars 0.9.0", "schemars 1.2.1", "serde_core", @@ -5796,9 +5331,9 @@ dependencies = [ [[package]] name = "serial2" -version = "0.2.35" +version = "0.2.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e66ab7ee258c6456796c6098e1b53a5baa1a5e0637347de59ddb44ee8e20be6e" +checksum = "fcdbc46aa3882ec3d48ec2b5abcb4f0d863a13d7599265f3faa6d851f23c12f3" dependencies = [ "cfg-if", "libc", @@ -5838,8 +5373,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", ] [[package]] @@ -5849,8 +5384,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.2", ] [[package]] @@ -5936,15 +5482,6 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" -[[package]] -name = "simd_helpers" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" -dependencies = [ - "quote", -] - [[package]] name = "similar" version = "2.7.0" @@ -6041,17 +5578,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "stringprep" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" -dependencies = [ - "unicode-bidi", - "unicode-normalization", - "unicode-properties", -] - [[package]] name = "strip-ansi-escapes" version = "0.2.1" @@ -6201,7 +5727,7 @@ checksum = "4676b37242ccbd1aabf56edb093a4827dc49086c0ffd764a5705899e0f35f8f7" dependencies = [ "anyhow", "base64 0.22.1", - "bitflags 2.11.0", + "bitflags 2.11.1", "fancy-regex 0.11.0", "filedescriptor", "finl_unicode", @@ -6218,7 +5744,7 @@ dependencies = [ "pest", "pest_derive", "phf 0.11.3", - "sha2", + "sha2 0.10.9", "signal-hook", "siphasher", "terminfo", @@ -6392,9 +5918,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.51.0" +version = "1.52.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd" +checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" dependencies = [ "bytes", "libc", @@ -6454,7 +5980,7 @@ version = "0.26.4" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ - "rustls 0.23.37", + "rustls 0.23.38", "tokio", ] @@ -6489,7 +6015,7 @@ checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9" dependencies = [ "futures-util", "log", - "rustls 0.23.37", + "rustls 0.23.38", "rustls-pki-types", "tokio", "tokio-rustls 0.26.4", @@ -6538,7 +6064,7 @@ version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ - "indexmap 2.13.1", + "indexmap 2.14.0", "serde", "serde_spanned", "toml_datetime", @@ -6574,7 +6100,7 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "bytes", "futures-util", "http 1.4.0", @@ -6697,12 +6223,6 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" -[[package]] -name = "ttf-parser" -version = "0.25.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31" - [[package]] name = "tui-textarea-2" version = "0.10.2" @@ -6729,8 +6249,8 @@ dependencies = [ "http 1.4.0", "httparse", "log", - "rand 0.8.5", - "rustls 0.23.37", + "rand 0.8.6", + "rustls 0.23.38", "rustls-pki-types", "sha1", "thiserror 1.0.69", @@ -6779,9 +6299,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.19.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" [[package]] name = 
"ucd-trie" @@ -6801,12 +6321,6 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" -[[package]] -name = "unicode-bidi" -version = "0.3.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" - [[package]] name = "unicode-ident" version = "1.0.24" @@ -6822,12 +6336,6 @@ dependencies = [ "tinyvec", ] -[[package]] -name = "unicode-properties" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" - [[package]] name = "unicode-segmentation" version = "1.13.2" @@ -6914,9 +6422,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.23.0" +version = "1.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ "atomic", "getrandom 0.4.2", @@ -6924,17 +6432,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "v_frame" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2" -dependencies = [ - "aligned-vec", - "num-traits", - "wasm-bindgen", -] - [[package]] name = "valuable" version = "0.1.1" @@ -7013,11 +6510,11 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.2+wasi-0.2.9" +version = "1.0.3+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" 
dependencies = [ - "wit-bindgen", + "wit-bindgen 0.57.1", ] [[package]] @@ -7026,14 +6523,14 @@ version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.51.0", ] [[package]] name = "wasm-bindgen" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" dependencies = [ "cfg-if", "once_cell", @@ -7044,9 +6541,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.67" +version = "0.4.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03623de6905b7206edd0a75f69f747f134b7f0a2323392d664448bf2d3c5d87e" +checksum = "f371d383f2fb139252e0bfac3b81b265689bf45b6874af544ffa4c975ac1ebf8" dependencies = [ "js-sys", "wasm-bindgen", @@ -7054,9 +6551,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7064,9 +6561,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" dependencies = [ "bumpalo", "proc-macro2", @@ -7077,9 +6574,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.117" +version = "0.2.118" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" dependencies = [ "unicode-ident", ] @@ -7101,7 +6598,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" dependencies = [ "anyhow", - "indexmap 2.13.1", + "indexmap 2.14.0", "wasm-encoder", "wasmparser", ] @@ -7125,17 +6622,17 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "hashbrown 0.15.5", - "indexmap 2.13.1", + "indexmap 2.14.0", "semver", ] [[package]] name = "web-sys" -version = "0.3.94" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd70027e39b12f0849461e08ffc50b9cd7688d942c1c8e3c7b22273236b4dd0a" +checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d" dependencies = [ "js-sys", "wasm-bindgen", @@ -7157,14 +6654,14 @@ version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "webpki-roots 1.0.6", + "webpki-roots 1.0.7", ] [[package]] name = "webpki-roots" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" dependencies = [ "rustls-pki-types", ] @@ -7193,7 +6690,7 @@ checksum = "692daff6d93d94e29e4114544ef6d5c942a7ed998b37abdc19b17136ea428eb7" dependencies = [ "getrandom 0.3.4", "mac_address", - "sha2", + "sha2 0.10.9", "thiserror 1.0.69", "uuid", ] @@ -7402,6 +6899,15 @@ 
dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -7435,13 +6941,30 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", + "windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link 0.2.1", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -7454,6 +6977,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -7466,6 +6995,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -7478,12 +7013,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -7496,6 +7043,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -7508,6 +7061,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -7520,6 +7079,12 @@ version 
= "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -7532,6 +7097,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + [[package]] name = "winnow" version = "0.7.15" @@ -7565,6 +7136,12 @@ dependencies = [ "wit-bindgen-rust-macro", ] +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + [[package]] name = "wit-bindgen-core" version = "0.51.0" @@ -7584,7 +7161,7 @@ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", "heck", - "indexmap 2.13.1", + "indexmap 2.14.0", "prettyplease", "syn 2.0.117", "wasm-metadata", @@ -7614,8 +7191,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags 2.11.0", - "indexmap 2.13.1", + "bitflags 2.11.1", + "indexmap 2.14.0", "log", "serde", "serde_derive", @@ -7634,7 +7211,7 @@ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" dependencies = [ "anyhow", "id-arena", - "indexmap 2.13.1", + "indexmap 2.14.0", "log", "semver", "serde", @@ -7723,12 +7300,6 @@ version = "0.13.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" -[[package]] -name = "y4m" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448" - [[package]] name = "yoke" version = "0.8.2" @@ -7843,7 +7414,7 @@ dependencies = [ "crossbeam-utils", "displaydoc", "flate2", - "indexmap 2.13.1", + "indexmap 2.14.0", "memchr", "thiserror 2.0.18", "zopfli", @@ -7901,15 +7472,6 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" -[[package]] -name = "zune-inflate" -version = "0.2.54" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02" -dependencies = [ - "simd-adler32", -] - [[package]] name = "zune-jpeg" version = "0.5.15" diff --git a/Cargo.toml b/Cargo.toml index f89ee20..45d733b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ "sdks/nodejs-native", ] [workspace.package] -version = "0.8.0" +version = "0.9.0" edition = "2024" rust-version = "1.95.0" license = "Apache-2.0" @@ -31,21 +31,21 @@ authors = ["Raphael Mansuy"] [workspace.dependencies] # Internal crates -edgecrab-command-catalog = { path = "crates/edgecrab-command-catalog", version = "0.8.0" } -edgecrab-types = { path = "crates/edgecrab-types", version = "0.8.0" } -edgecrab-security = { path = "crates/edgecrab-security", version = "0.8.0" } -edgecrab-state = { path = "crates/edgecrab-state", version = "0.8.0" } -edgecrab-plugins = { path = "crates/edgecrab-plugins", version = "0.8.0" } -edgecrab-cron = { path = "crates/edgecrab-cron", version = "0.8.0" } -edgecrab-lsp = { path = "crates/edgecrab-lsp", version = "0.8.0" } -edgecrab-tools = { path = "crates/edgecrab-tools", version = "0.8.0" } -edgecrab-core = { 
path = "crates/edgecrab-core", version = "0.8.0" } -edgecrab-gateway = { path = "crates/edgecrab-gateway", version = "0.8.0" } -edgecrab-acp = { path = "crates/edgecrab-acp", version = "0.8.0" } -edgecrab-migrate = { path = "crates/edgecrab-migrate", version = "0.8.0" } -edgecrab-sdk-core = { path = "crates/edgecrab-sdk-core", version = "0.8.0" } -edgecrab-sdk-macros = { path = "crates/edgecrab-sdk-macros", version = "0.8.0" } -edgecrab-sdk = { path = "crates/edgecrab-sdk", version = "0.8.0" } +edgecrab-command-catalog = { path = "crates/edgecrab-command-catalog", version = "0.9.0" } +edgecrab-types = { path = "crates/edgecrab-types", version = "0.9.0" } +edgecrab-security = { path = "crates/edgecrab-security", version = "0.9.0" } +edgecrab-state = { path = "crates/edgecrab-state", version = "0.9.0" } +edgecrab-plugins = { path = "crates/edgecrab-plugins", version = "0.9.0" } +edgecrab-cron = { path = "crates/edgecrab-cron", version = "0.9.0" } +edgecrab-lsp = { path = "crates/edgecrab-lsp", version = "0.9.0" } +edgecrab-tools = { path = "crates/edgecrab-tools", version = "0.9.0" } +edgecrab-core = { path = "crates/edgecrab-core", version = "0.9.0" } +edgecrab-gateway = { path = "crates/edgecrab-gateway", version = "0.9.0" } +edgecrab-acp = { path = "crates/edgecrab-acp", version = "0.9.0" } +edgecrab-migrate = { path = "crates/edgecrab-migrate", version = "0.9.0" } +edgecrab-sdk-core = { path = "crates/edgecrab-sdk-core", version = "0.9.0" } +edgecrab-sdk-macros = { path = "crates/edgecrab-sdk-macros", version = "0.9.0" } +edgecrab-sdk = { path = "crates/edgecrab-sdk", version = "0.9.0" } # Shared across workspace — single version for consistency serde = { version = "1", features = ["derive"] } diff --git a/README.md b/README.md index 68a5253..7bb9324 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ EdgeCrab is a **SuperAgent** — a personal assistant and coding agent forged in Rust. 
It carries the soul of **Nous Hermes Agent** (autonomous reasoning, persistent memory, user-first alignment) and the always-on presence of **OpenClaw** (17 messaging gateways, smart-home integration), packaged as a stripped native release binary of about **49 MB** on current macOS arm64 builds, with zero Python or Node.js runtime dependencies. Runs on Linux, macOS, and Android (Termux). -> **Latest release: v0.8.0** — mission steering in the TUI and gateway, prompt-cache-aware system prompt blocks, accessibility-tuned terminal colors, and the published `edgequake-llm 0.6.12` runtime handoff. +> **Latest release: v0.9.0** — an opt-in Shadow Judge completion oracle, structured `report_task_status` harness signals, and tighter anti-loop tool contracts that help EdgeCrab recover from premature stops and repeated malformed tool retries. ## Architecture diff --git a/crates/edgecrab-cli/src/app.rs b/crates/edgecrab-cli/src/app.rs index c9ede71..c350fa7 100644 --- a/crates/edgecrab-cli/src/app.rs +++ b/crates/edgecrab-cli/src/app.rs @@ -519,9 +519,13 @@ async fn forward_stream_event_to_tui( match event { StreamEvent::Token(text) => { - *saw_token_event = true; - tracing::info!(len = text.len(), "TUI→agent: forwarding token"); - let _ = tx.send(AgentResponse::Token(text)); + if text.is_empty() { + tracing::debug!("TUI→agent: dropping empty token delta"); + } else { + *saw_token_event = true; + tracing::info!(len = text.len(), "TUI→agent: forwarding token"); + let _ = tx.send(AgentResponse::Token(text)); + } } StreamEvent::Reasoning(text) => { *saw_reasoning_event = true; @@ -2389,12 +2393,13 @@ fn wait_urgency_color(elapsed_secs: u64) -> Color { fn format_waiting_first_token_status( theme: &Theme, + glyphs: TerminalGlyphProfile, frame_idx: usize, verb_idx: usize, face_idx: usize, elapsed_secs: u64, ) -> String { - let spinner = SPINNER_FRAMES[frame_idx % SPINNER_FRAMES.len()]; + let spinner = compact_spinner_frame(frame_idx, glyphs); let verb = if 
theme.waiting_verbs.is_empty() { "awaiting" } else { @@ -2415,12 +2420,13 @@ fn format_waiting_first_token_status( fn format_thinking_status( theme: &Theme, + glyphs: TerminalGlyphProfile, frame_idx: usize, verb_idx: usize, face_idx: usize, elapsed_secs: u64, ) -> String { - let spinner = SPINNER_FRAMES[frame_idx % SPINNER_FRAMES.len()]; + let spinner = compact_spinner_frame(frame_idx, glyphs); let verb = if theme.thinking_verbs.is_empty() { "thinking" } else { @@ -4705,6 +4711,8 @@ pub struct App { show_status_bar: bool, /// Whether dangerous command approvals are bypassed for the current session. yolo_enabled: bool, + /// Whether the Shadow Judge completion oracle is active for this session. + shadow_judge_enabled: bool, /// Queued prompts to run after the current one completes prompt_queue: Vec, /// Display state machine (spinner animation) @@ -4833,6 +4841,16 @@ pub struct App { statusbar_selector_active: bool, /// Which row is highlighted in the statusbar picker (0=Visible, 1=Hidden) statusbar_selector_cursor: usize, + /// Shadow Judge picker overlay (activated by `/shadow-judge` with no args) + shadow_judge_selector_active: bool, + /// Which row is highlighted in the shadow judge picker (0=ON, 1=OFF) + shadow_judge_selector_cursor: usize, + /// Timestamp for the most recent Shadow Judge intervention notice. + shadow_judge_intervention_at: Option, + /// Latest Shadow Judge intervention summary line. + shadow_judge_intervention_text: Option, + /// Latest Shadow Judge intervention confidence (0.0..1.0). + shadow_judge_intervention_confidence: Option, /// Cached skill names (without leading /) for completion suggestions skills_completion_names: Vec, /// Skills currently activated for injection into agent prompts. 
@@ -5704,6 +5722,7 @@ impl App { tool_progress_mode: display_preferences.tool_progress_mode, show_status_bar: display_preferences.show_status_bar, yolo_enabled: false, + shadow_judge_enabled: runtime_config.shadow_judge.enabled, prompt_queue: Vec::new(), display_state: DisplayState::Idle, completion: CompletionState { @@ -5772,6 +5791,11 @@ impl App { stream_selector_cursor: 0, // default to ON statusbar_selector_active: false, statusbar_selector_cursor: 0, // default to Visible + shadow_judge_selector_active: false, + shadow_judge_selector_cursor: 0, + shadow_judge_intervention_at: None, + shadow_judge_intervention_text: None, + shadow_judge_intervention_confidence: None, skills_completion_names: Vec::new(), active_skills: Vec::new(), last_terminal_width: 80, @@ -9179,6 +9203,7 @@ impl App { self.personality_selector_active = false; self.stream_selector_active = false; self.statusbar_selector_active = false; + self.shadow_judge_selector_active = false; self.needs_redraw = true; let now = Instant::now(); let is_double = self @@ -9579,6 +9604,32 @@ impl App { return; } + // Shadow Judge picker overlay active — intercept all keys + if self.shadow_judge_selector_active { + match key.code { + KeyCode::Esc => { + self.shadow_judge_selector_active = false; + self.needs_redraw = true; + } + KeyCode::Up | KeyCode::BackTab | KeyCode::Down | KeyCode::Tab => { + self.shadow_judge_selector_cursor = 1 - self.shadow_judge_selector_cursor; + self.needs_redraw = true; + } + KeyCode::Enter => { + let arg = if self.shadow_judge_selector_cursor == 0 { + "on" + } else { + "off" + }; + self.shadow_judge_selector_active = false; + self.handle_set_shadow_judge(arg.to_string()); + self.needs_redraw = true; + } + _ => {} + } + return; + } + // Verbose / tool-progress picker overlay active — intercept all keys if self.verbose_selector_active { match key.code { @@ -12296,6 +12347,16 @@ impl App { CommandResult::SetYolo(mode) => { self.handle_set_yolo(mode); } + 
CommandResult::SetShadowJudge(mode) => { + if mode.trim().is_empty() { + self.shadow_judge_selector_cursor = + if self.shadow_judge_enabled { 0 } else { 1 }; + self.shadow_judge_selector_active = true; + self.needs_redraw = true; + } else { + self.handle_set_shadow_judge(mode); + } + } CommandResult::ApprovalChoice(choice) => { self.handle_approval_choice_command(choice); } @@ -12781,6 +12842,11 @@ impl App { name, args_json, } => { + if name == "shadow_judge" { + self.handle_shadow_judge_intervention_notice(&args_json); + self.needs_redraw = true; + continue; + } self.flush_buffered_assistant_output(); // CRITICAL: Break the streaming buffer at the tool boundary. // Without this, tokens arriving after the tool call append to @@ -17615,6 +17681,106 @@ impl App { .and_then(|snap| snap.session_id) } + fn handle_shadow_judge_intervention_notice(&mut self, args_json: &str) { + let parsed: serde_json::Value = match serde_json::from_str(args_json) { + Ok(v) => v, + Err(_) => { + self.push_output( + "🧭 Shadow Judge intervened and requested continuation.", + OutputRole::System, + ); + self.shadow_judge_intervention_at = Some(Instant::now()); + self.shadow_judge_intervention_text = Some("requested continuation".to_string()); + self.shadow_judge_intervention_confidence = None; + return; + } + }; + + let verdict = parsed + .get("verdict") + .and_then(|v| v.as_str()) + .unwrap_or("incomplete"); + let reason = parsed + .get("reason") + .and_then(|v| v.as_str()) + .unwrap_or("task appears incomplete"); + let confidence = parsed + .get("confidence") + .and_then(|v| v.as_f64()) + .map(|v| v as f32) + .unwrap_or(0.0); + + if verdict == "incomplete" { + let reason_short = edgecrab_core::safe_truncate(reason, 120); + let pct = (confidence * 100.0).clamp(0.0, 100.0); + self.push_output( + format!( + "🧭 Shadow Judge intervention ({pct:.0}% confidence): {reason_short}. Continuing automatically." 
+ ), + OutputRole::System, + ); + self.shadow_judge_intervention_at = Some(Instant::now()); + self.shadow_judge_intervention_text = Some(reason_short.to_string()); + self.shadow_judge_intervention_confidence = Some(confidence); + } + } + + fn handle_set_shadow_judge(&mut self, mode: String) { + let action = mode.trim().to_ascii_lowercase(); + match action.as_str() { + "" | "toggle" => { + self.shadow_judge_enabled = !self.shadow_judge_enabled; + } + "on" | "enable" | "enabled" => { + self.shadow_judge_enabled = true; + } + "off" | "disable" | "disabled" => { + self.shadow_judge_enabled = false; + } + "status" => { + self.push_output( + format!( + "Shadow Judge is {} for this session.", + if self.shadow_judge_enabled { + "ON" + } else { + "OFF" + } + ), + OutputRole::System, + ); + return; + } + other => { + self.push_output( + format!( + "Unknown argument '{other}'. Use: /shadow-judge [on|off|toggle|status]" + ), + OutputRole::System, + ); + return; + } + } + // Propagate the change to the live agent so the next turn picks it up. 
+ if let Some(agent) = self.agent.clone() { + let enabled = self.shadow_judge_enabled; + self.rt_handle.spawn(async move { + agent.set_shadow_judge_enabled(enabled).await; + }); + } + self.push_output( + format!( + "Shadow Judge {} for this session.", + if self.shadow_judge_enabled { + "enabled \u{2014} the completion oracle will verify task completion" + } else { + "disabled" + } + ), + OutputRole::System, + ); + } + fn handle_set_yolo(&mut self, mode: String) { let Some(session_key) = self.current_session_key() else { self.push_output( @@ -22114,6 +22280,11 @@ impl App { self.render_statusbar_selector(frame, frame.area()); } + // Shadow Judge picker overlay (compact centered popup) + if self.shadow_judge_selector_active { + self.render_shadow_judge_selector(frame, frame.area()); + } + // Steering overlay (compact floating panel — lower screen half) if self.steering_overlay_active { self.render_steering_overlay(frame, frame.area()); @@ -22228,7 +22399,7 @@ impl App { // status bar. The ghost line disappears naturally once real tokens arrive. match &self.display_state { DisplayState::AwaitingFirstToken { frame, started } => { - let spinner = SPINNER_FRAMES[*frame % SPINNER_FRAMES.len()]; + let spinner = compact_spinner_frame(*frame, self.terminal_glyph_profile); let elapsed = started.elapsed().as_secs(); let ghost_text: String = if elapsed > 10 { format!(" {spinner} awaiting response\u{2026} {elapsed}s (^C to stop)") @@ -22260,7 +22431,7 @@ impl App { // reasoning_line is Some, the user already sees live reasoning // text — adding a ghost line would duplicate the signal. 
if self.reasoning_line.is_none() => { - let spinner = SPINNER_FRAMES[*frame % SPINNER_FRAMES.len()]; + let spinner = compact_spinner_frame(*frame, self.terminal_glyph_profile); let elapsed = started.elapsed().as_secs(); let ghost_text: String = if elapsed > 3 { format!(" {spinner} thinking\u{2026} {elapsed}s") @@ -22464,7 +22635,7 @@ impl App { // ── Ghost waiting line (FP45) compact variant ───────────────── match &self.display_state { DisplayState::AwaitingFirstToken { frame, started } => { - let spinner = SPINNER_FRAMES[*frame % SPINNER_FRAMES.len()]; + let spinner = compact_spinner_frame(*frame, glyphs); let elapsed = started.elapsed().as_secs(); let ghost: String = if elapsed > 3 { format!(" {spinner} awaiting\u{2026} {elapsed}s") @@ -22480,7 +22651,7 @@ impl App { ))); } DisplayState::Thinking { frame, started } if self.reasoning_line.is_none() => { - let spinner = SPINNER_FRAMES[*frame % SPINNER_FRAMES.len()]; + let spinner = compact_spinner_frame(*frame, glyphs); let elapsed = started.elapsed().as_secs(); let ghost: String = if elapsed > 3 { format!(" {spinner} thinking\u{2026} {elapsed}s") @@ -22607,6 +22778,7 @@ impl App { let elapsed_secs = started.elapsed().as_secs(); let msg = format_waiting_first_token_status( &self.theme, + self.terminal_glyph_profile, *f, self.thinking_verb_idx, self.kaomoji_frame_idx, @@ -22620,6 +22792,7 @@ impl App { let elapsed_secs = started.elapsed().as_secs(); let msg = format_thinking_status( &self.theme, + self.terminal_glyph_profile, *f, self.thinking_verb_idx, self.kaomoji_frame_idx, @@ -22671,7 +22844,7 @@ impl App { started, .. 
} => { - let spinner = SPINNER_FRAMES[*f % SPINNER_FRAMES.len()]; + let spinner = compact_spinner_frame(*f, self.terminal_glyph_profile); let summary = summarize_active_tools(&self.active_tools); let elapsed_secs = summary .as_ref() @@ -22726,7 +22899,7 @@ impl App { frame: f, started, } => { - let spinner = SPINNER_FRAMES[*f % SPINNER_FRAMES.len()]; + let spinner = compact_spinner_frame(*f, self.terminal_glyph_profile); let elapsed = started.elapsed().as_secs(); let msg = if elapsed > 3 { format!(" {spinner} {label} {elapsed}s ") @@ -22974,6 +23147,50 @@ impl App { )); } + // ── Shadow Judge indicator ──────────────────────────────────────── + // Show a compact " SJ " badge when the completion oracle is active. + if self.shadow_judge_enabled { + left_spans.push(Span::styled( + " │ ", + Style::default().fg(Color::Rgb(50, 50, 65)), + )); + left_spans.push(Span::styled( + " SJ ", + Style::default() + .fg(Color::Rgb(18, 32, 26)) + .bg(Color::Rgb(130, 200, 255)) + .add_modifier(Modifier::BOLD), + )); + } + if self + .shadow_judge_intervention_at + .is_some_and(|t| t.elapsed() < std::time::Duration::from_secs(10)) + { + let confidence = self + .shadow_judge_intervention_confidence + .map(|c| (c * 100.0).clamp(0.0, 100.0)); + let reason = self + .shadow_judge_intervention_text + .as_deref() + .map(|text| edgecrab_core::safe_truncate(text, 42).to_string()) + .unwrap_or_else(|| "continuation requested".to_string()); + left_spans.push(Span::styled( + " │ ", + Style::default().fg(Color::Rgb(50, 50, 65)), + )); + left_spans.push(Span::styled( + if let Some(conf) = confidence { + format!(" SJ veto {conf:.0}%: {reason} ") + } else { + format!(" SJ veto: {reason} ") + }, + Style::default() + .fg(Color::Rgb(30, 22, 8)) + .bg(Color::Rgb(255, 200, 90)) + .add_modifier(Modifier::BOLD), + )); + } + // Right side: keyboard hints + turn counter let mut right_spans = Vec::new(); if self.turn_count > 0 { @@ -23217,7 +23434,7 @@ impl App { } ); let left = format!( - 
"{}{}{}{}{}{}${:.4}{}{}{}{}", + "{}{}{}{}{}{}${:.4}{}{}{}{}{}{}", state, divider, edgecrab_core::safe_truncate(&self.model_name, 18), @@ -23229,6 +23446,19 @@ impl App { transport, divider, profile, + if self.shadow_judge_enabled { + " | SJ" + } else { + "" + }, + if self + .shadow_judge_intervention_at + .is_some_and(|t| t.elapsed() < std::time::Duration::from_secs(10)) + { + " | SJ veto" + } else { + "" + }, ); let right_width = right.width().min(area.width as usize) as u16; let left_area = Rect { @@ -29114,6 +29344,131 @@ impl App { frame.render_widget(Paragraph::new(picker_help_line(accent)), chunks[2]); } + /// Render the shadow judge picker (2 options: on / off). + fn render_shadow_judge_selector(&self, frame: &mut Frame, area: Rect) { + let popup = popup_rect(area, 74, 18); + frame.render_widget(Clear, popup); + let chunks = picker_three_layout(popup); + let body = picker_two_cols(chunks[1], 42); + + const ENTRIES: [(&str, &str, &str); 2] = [ + ( + "on", + "ON", + "Run the completion oracle before finalizing; vetoes likely-incomplete stops.", + ), + ( + "off", + "OFF", + "Skip completion verification and trust the normal completion policy only.", + ), + ]; + + let accent = Color::Rgb(130, 200, 255); + let cursor = self.shadow_judge_selector_cursor; + let cur_label = if self.shadow_judge_enabled { + "on" + } else { + "off" + }; + + let header = Paragraph::new(Line::from(vec![ + Span::styled(" ◈ ", Style::default().fg(accent)), + Span::styled( + "Shadow Judge", + Style::default() + .fg(Color::Rgb(210, 235, 255)) + .add_modifier(Modifier::BOLD), + ), + Span::raw(" "), + Span::styled( + format!("current: {}", cur_label.to_uppercase()), + Style::default().fg(Color::Rgb(145, 190, 230)), + ), + ])) + .block( + Block::default() + .borders(Borders::ALL) + .border_style(Style::default().fg(Color::Rgb(90, 145, 195))) + .title(" /shadow-judge "), + ); + frame.render_widget(header, chunks[0]); + + let items: Vec = ENTRIES + .iter() + .enumerate() + .map(|(i, (key, label, 
_))| { + let is_cursor = i == cursor; + let is_active = *key == cur_label; + let bg = if is_cursor { + Color::Rgb(28, 52, 74) + } else { + Color::Reset + }; + let fg = if is_cursor { + Color::White + } else { + Color::Rgb(185, 215, 240) + }; + ListItem::new(Line::from(vec![ + selector_marker(is_cursor, accent, Some(bg)), + Span::styled( + format!(" {label:<4}", label = *label), + Style::default().fg(fg).bg(bg), + ), + Span::styled( + if is_active { " ✓" } else { "" }, + Style::default().fg(Color::Rgb(105, 210, 125)).bg(bg), + ), + ])) + }) + .collect(); + frame.render_widget( + List::new(items).block(Block::default().borders(Borders::LEFT | Borders::RIGHT)), + body[0], + ); + + let (key, label, desc) = ENTRIES[cursor]; + let is_active_cur = key == cur_label; + let action_hint = if is_active_cur { + "Already active" + } else { + "Press Enter to apply" + }; + let detail = Paragraph::new(vec![ + Line::from(""), + Line::from(Span::styled( + format!(" {label}"), + Style::default() + .fg(Color::Rgb(210, 235, 255)) + .add_modifier(Modifier::BOLD), + )), + Line::from(""), + Line::from(Span::styled( + format!(" {desc}"), + Style::default().fg(Color::Rgb(170, 205, 232)), + )), + Line::from(""), + Line::from(Span::styled( + format!(" {action_hint}"), + Style::default().fg(if is_active_cur { + Color::Rgb(100, 200, 100) + } else { + Color::Rgb(220, 180, 80) + }), + )), + ]) + .wrap(Wrap { trim: true }) + .block( + Block::default() + .borders(Borders::ALL) + .border_style(Style::default().fg(Color::Rgb(65, 105, 145))), + ); + frame.render_widget(detail, body[1]); + + frame.render_widget(Paragraph::new(picker_help_line(accent)), chunks[2]); + } + /// Render the compact mission-steering overlay. /// /// The overlay appears as a small floating panel over the lower-half of the @@ -29346,9 +29701,10 @@ impl App { // feedback appear immediately on the current frame. 
let text = self.textarea_text(); let block = if self.is_processing { - // FP53: Animate the waiting title using the same braille spinner frame + // FP53: Animate the waiting title using the same spinner frame // as the status bar — zero extra state, perfect sync. - let spinner = SPINNER_FRAMES[self.current_spinner_frame() % SPINNER_FRAMES.len()]; + let spinner = + compact_spinner_frame(self.current_spinner_frame(), self.terminal_glyph_profile); let waiting_label = format!("{spinner} waiting…"); Block::default() .borders(Borders::ALL) @@ -32915,13 +33271,23 @@ kind = "skill" #[test] fn waiting_first_token_status_surfaces_the_right_message() { let theme = Theme::default(); - let early = format_waiting_first_token_status(&theme, 0, 0, 0, 2); - let long = format_waiting_first_token_status(&theme, 0, 0, 0, 12); + let early = + format_waiting_first_token_status(&theme, TerminalGlyphProfile::Unicode, 0, 0, 0, 2); + let long = + format_waiting_first_token_status(&theme, TerminalGlyphProfile::Unicode, 0, 0, 0, 12); assert!(early.contains("first token")); assert!(long.contains("waiting for first token")); assert!(long.contains("^C=stop")); } + #[test] + fn waiting_first_token_status_uses_ascii_spinner_when_requested() { + let theme = Theme::default(); + let status = + format_waiting_first_token_status(&theme, TerminalGlyphProfile::Ascii, 0, 0, 0, 2); + assert!(status.starts_with("- ")); + } + #[test] fn voice_presence_badges_cover_recording_and_playback_modes() { let recording = format_voice_presence_badge( diff --git a/crates/edgecrab-cli/src/commands.rs b/crates/edgecrab-cli/src/commands.rs index ddbdbe3..0a89b3c 100644 --- a/crates/edgecrab-cli/src/commands.rs +++ b/crates/edgecrab-cli/src/commands.rs @@ -214,6 +214,8 @@ pub enum CommandResult { MouseMode(String), /// Toggle or inspect YOLO approval bypass for the current session. SetYolo(String), + /// Toggle the Shadow Judge completion oracle (on/off/toggle/status). 
+ SetShadowJudge(String), /// Resolve the current approval prompt from a slash command. ApprovalChoice(edgecrab_core::ApprovalChoice), /// macOS permission diagnostics and bootstrap workflow. @@ -1270,6 +1272,13 @@ impl CommandRegistry { handler: |args| CommandResult::SetYolo(args.trim().to_string()), }); + self.register(Command { + name: "shadow-judge", + aliases: &["sj", "shadow_judge"], + description: "Toggle shadow judge completion oracle (on/off/toggle/status)", + handler: |args| CommandResult::SetShadowJudge(args.trim().to_string()), + }); + // ── Scheduling ──────────────────────────────────────────────── self.register(Command { diff --git a/crates/edgecrab-core/src/agent.rs b/crates/edgecrab-core/src/agent.rs index dfb1c44..9535931 100644 --- a/crates/edgecrab-core/src/agent.rs +++ b/crates/edgecrab-core/src/agent.rs @@ -188,6 +188,8 @@ pub struct AgentConfig { pub compression: crate::config::CompressionConfig, /// Auxiliary side-task routing (vision, compression, other helper calls). pub auxiliary: crate::config::AuxiliaryConfig, + /// Shadow judge configuration (completion oracle). + pub shadow_judge: crate::config::ShadowJudgeConfig, /// Default Mixture-of-Agents roster and aggregator. pub moa: crate::config::MoaConfig, /// Voice output configuration projected from AppConfig. @@ -260,6 +262,7 @@ impl Default for AgentConfig { edgecrab_tools::tools::backends::SingularityBackendConfig::default(), compression: crate::config::CompressionConfig::default(), auxiliary: crate::config::AuxiliaryConfig::default(), + shadow_judge: crate::config::ShadowJudgeConfig::default(), moa: crate::config::MoaConfig::default(), tts: crate::config::TtsConfig::default(), stt: crate::config::SttConfig::default(), @@ -601,6 +604,19 @@ impl Agent { *self.gateway_sender.write().await = Some(sender); } + /// Toggle the shadow judge completion oracle for this session. 
+ /// + /// Safe to call while a conversation is running — the value is read at + /// the next `LoopAction::Done` decision point, not during the LLM call. + pub async fn set_shadow_judge_enabled(&self, enabled: bool) { + self.config.write().await.shadow_judge.enabled = enabled; + } + + /// Returns the current shadow judge enabled state. + pub async fn shadow_judge_enabled(&self) -> bool { + self.config.read().await.shadow_judge.enabled + } + /// Gateway interface — send a message with origin context (platform + chat_id). /// /// Unlike `chat()`, this sets the origin so that `manage_cron_jobs` jobs @@ -1847,6 +1863,7 @@ impl AgentBuilder { terminal_singularity: config.terminal.singularity.clone(), compression: config.compression.clone(), auxiliary: config.auxiliary.clone(), + shadow_judge: config.shadow_judge.clone(), moa: config.moa.clone(), tts: config.tts.clone(), stt: config.stt.clone(), diff --git a/crates/edgecrab-core/src/completion_assessor.rs b/crates/edgecrab-core/src/completion_assessor.rs index 719c495..1bf0b18 100644 --- a/crates/edgecrab-core/src/completion_assessor.rs +++ b/crates/edgecrab-core/src/completion_assessor.rs @@ -32,6 +32,8 @@ impl CompletionPolicy for DefaultCompletionPolicy { let pending_clarification = ctx.pending_clarification || has_clarify_marker(ctx); let pending_approval = ctx.pending_approval || has_approval_marker(ctx); let verification = collect_verification_summary(ctx.messages); + let recent_tool_activity = has_recent_tool_activity(ctx.messages); + let deferred_work = recent_tool_activity && has_deferred_work_signal(ctx.final_response); let reported_progress = collect_reported_progress_state(ctx.messages); let reported_blocked = matches!( reported_progress.latest_status, @@ -81,6 +83,12 @@ impl CompletionPolicy for DefaultCompletionPolicy { ExitReason::PendingTasks, "Incomplete — progress was reported but work still remains.", ) + } else if deferred_work { + RunOutcome::new( + CompletionDecision::Incomplete, + 
ExitReason::PendingTasks, + "Incomplete — the assistant described a next step instead of executing it.", + ) } else if ctx.final_response.trim().is_empty() { RunOutcome::new( CompletionDecision::Failed, @@ -134,6 +142,70 @@ fn has_approval_marker(ctx: &CompletionContext<'_>) -> bool { }) } +fn has_recent_tool_activity(messages: &[Message]) -> bool { + messages + .iter() + .rev() + .take(6) + .any(|msg| msg.role == Role::Tool) +} + +fn has_deferred_work_signal(text: &str) -> bool { + if text.trim().is_empty() { + return false; + } + + let normalized = text + .to_ascii_lowercase() + .split_whitespace() + .collect::>() + .join(" "); + let window: String = normalized.chars().take(240).collect(); + + let intent_markers = [ + "let me ", + "i'll ", + "i will ", + "now i'll ", + "now i will ", + "next i'll ", + "next i will ", + "then i'll ", + "then i will ", + "i'm going to ", + "i am going to ", + ]; + let action_verbs = [ + "create", + "write", + "build", + "update", + "fix", + "run", + "retry", + "try", + "inspect", + "check", + "search", + "edit", + "patch", + "implement", + "add", + "continue", + "open", + "read", + ]; + + intent_markers.iter().any(|marker| { + window.match_indices(marker).any(|(index, _)| { + let after = &window[index + marker.len()..]; + action_verbs + .iter() + .any(|verb| after.find(verb).is_some_and(|pos| pos <= 48)) + }) + }) +} + #[derive(Debug, Default)] struct ReportedProgressState { latest_status: Option, @@ -416,6 +488,71 @@ mod tests { assert!(outcome.verification.evidence_present); } + #[test] + fn deferred_work_after_tool_activity_keeps_run_incomplete() { + let messages = vec![Message::tool_result( + "tc_1", + "write_file", + "Created empty scaffold at './game2'.", + )]; + let ctx = CompletionContext { + final_response: "I see the issue. The directory already exists. 
Let me try writing the file directly without creating directories first.", + messages: &messages, + interrupted: false, + budget_exhausted: false, + pending_approval: false, + pending_clarification: false, + active_todos: 0, + blocked_todos: 0, + child_runs_in_flight: 0, + }; + + let outcome = assess_completion(&ctx); + assert_eq!(outcome.state, CompletionDecision::Incomplete); + assert_eq!(outcome.exit_reason, ExitReason::PendingTasks); + } + + #[test] + fn final_answer_after_tool_activity_can_still_complete() { + let messages = vec![Message::tool_result( + "tc_1", + "write_file", + "Wrote ./game2/index.html successfully.", + )]; + let ctx = CompletionContext { + final_response: "The file is in place and the task is complete.", + messages: &messages, + interrupted: false, + budget_exhausted: false, + pending_approval: false, + pending_clarification: false, + active_todos: 0, + blocked_todos: 0, + child_runs_in_flight: 0, + }; + + let outcome = assess_completion(&ctx); + assert_eq!(outcome.state, CompletionDecision::Completed); + } + + #[test] + fn deferred_work_without_recent_tool_activity_does_not_trigger_heuristic() { + let ctx = CompletionContext { + final_response: "Let me explain the result in more detail.", + messages: &[], + interrupted: false, + budget_exhausted: false, + pending_approval: false, + pending_clarification: false, + active_todos: 0, + blocked_todos: 0, + child_runs_in_flight: 0, + }; + + let outcome = assess_completion(&ctx); + assert_eq!(outcome.state, CompletionDecision::Completed); + } + #[test] fn in_progress_report_keeps_run_incomplete() { let report = serde_json::json!({ diff --git a/crates/edgecrab-core/src/config.rs b/crates/edgecrab-core/src/config.rs index 7648511..c8d168c 100644 --- a/crates/edgecrab-core/src/config.rs +++ b/crates/edgecrab-core/src/config.rs @@ -61,6 +61,7 @@ pub struct AppConfig { pub voice: VoiceConfig, pub honcho: HonchoConfig, pub auxiliary: AuxiliaryConfig, + pub shadow_judge: ShadowJudgeConfig, pub moa: 
MoaConfig, pub reasoning_effort: Option, pub context: ContextConfig, @@ -1999,6 +2000,59 @@ pub struct AuxiliaryConfig { pub api_key_env: Option, } +/// Shadow judge configuration — lightweight LLM completion oracle. +/// +/// When enabled, the shadow judge fires after the synchronous +/// `DefaultCompletionPolicy` returns `Completed`. It makes a single +/// LLM classification call to verify that the original user request is +/// actually satisfied before allowing the loop to break. +/// +/// # Design constraints +/// +/// - The shadow call NEVER mutates `session.messages` (read-only borrow). +/// - The main session system prompt is NEVER rebuilt (no cache invalidation). +/// - Non-fatal: any API error falls through to normal loop termination. +/// - Opt-in only: `enabled: false` is the default (no impact on existing sessions). +/// - Per-session invocation cap prevents infinite correction spirals. +/// +/// Default: all fields produce a safe **disabled** state. +#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(default)] +pub struct ShadowJudgeConfig { + /// Enable shadow judge. Default: `false` (opt-in). + pub enabled: bool, + /// Judge model (e.g. `"anthropic/claude-haiku-4-20250514"`). + /// `null` → falls back to `auxiliary.model` → then to the main model. + pub model: Option, + /// Hard cap on shadow judge invocations per session. + /// Prevents infinite correction loops. Default: `5`. + pub max_per_session: u32, + /// Confidence threshold below which the judge verdict is treated as + /// "complete" (i.e. low confidence → don't block). Range `[0.0, 1.0]`. + /// Default: `0.70`. + pub confidence_threshold: f32, + /// Number of most-recent messages to pass to the judge. + /// `0` = send all messages (caution: more tokens). Default: `20`. + pub context_messages: usize, + /// Minimum conversation length before the judge is eligible to fire. + /// Prevents trivial one-shot Q&A sessions from being judged. + /// Default: `4`. 
+ pub min_messages_before_enable: usize, +} + +impl Default for ShadowJudgeConfig { + fn default() -> Self { + Self { + enabled: false, + model: None, + max_per_session: 5, + confidence_threshold: 0.70, + context_messages: 20, + min_messages_before_enable: 4, + } + } +} + /// Default Mixture-of-Agents configuration. /// /// These values are used when the `moa` tool is called without diff --git a/crates/edgecrab-core/src/conversation.rs b/crates/edgecrab-core/src/conversation.rs index a4ca17c..6be0fba 100644 --- a/crates/edgecrab-core/src/conversation.rs +++ b/crates/edgecrab-core/src/conversation.rs @@ -1310,6 +1310,22 @@ impl Agent { let mut compression_llm_failures: u32 = 0; const MAX_COMPRESSION_LLM_FAILURES: u32 = 3; + // ── Shadow Judge setup ──────────────────────────────────────────────── + // Clone config snapshot so the judge resolution has stable values even + // if the config is hot-swapped mid-session. + let shadow_judge_cfg = config.shadow_judge.clone(); + let mut shadow_judge_invocations: u32 = 0; + let (shadow_judge_provider, shadow_judge_model) = if shadow_judge_cfg.enabled { + crate::shadow_judge::resolve_shadow_provider_and_model( + &shadow_judge_cfg, + config.auxiliary.model.as_deref(), + effective_provider.clone(), + &config.model, + ) + } else { + (effective_provider.clone(), config.model.clone()) + }; + 'conversation_loop: loop { if tool_defs_dirty { active_tool_defs = if let Some(ref registry) = tool_registry { @@ -1917,6 +1933,80 @@ impl Agent { continue; } + // ── Shadow Judge veto ──────────────────────────────────── + // Fires only when: + // 1. Shadow judge is enabled in config. + // 2. The per-session invocation cap has not been reached. + // 3. The conversation is long enough to warrant it. + // Non-fatal: any error falls through to normal loop break. 
+ if shadow_judge_cfg.enabled + && shadow_judge_invocations < shadow_judge_cfg.max_per_session + && session.messages.len() >= shadow_judge_cfg.min_messages_before_enable + { + shadow_judge_invocations += 1; + tracing::debug!( + invocation = shadow_judge_invocations, + max = shadow_judge_cfg.max_per_session, + messages = session.messages.len(), + model = %shadow_judge_model, + "shadow judge: invoking completion oracle" + ); + let verdict = crate::shadow_judge::run_shadow_judge( + &shadow_judge_provider, + &shadow_judge_model, + &session.messages, + &shadow_judge_cfg, + ) + .await; + + if let Some(verdict) = verdict { + // Attribute tokens to session cost regardless of verdict. + session.session_input_tokens += u64::from(verdict.input_tokens); + session.session_output_tokens += u64::from(verdict.output_tokens); + + tracing::info!( + is_complete = verdict.is_complete, + confidence = verdict.confidence, + reason = %verdict.reason, + invocation = shadow_judge_invocations, + "shadow judge: verdict" + ); + + let confidence_above_threshold = + verdict.confidence >= shadow_judge_cfg.confidence_threshold; + + if !verdict.is_complete && confidence_above_threshold { + // Veto: inject steering hint and continue the loop. 
+ let hint = verdict.steering_hint.as_deref().unwrap_or( + "Continue working until all parts of the request are complete.", + ); + let msg = build_shadow_judge_message(hint, &verdict.reason); + tracing::info!( + hint = %hint, + "shadow judge: vetoing completion, injecting continuation nudge" + ); + if let Some(tx) = event_tx { + let _ = tx.send(crate::StreamEvent::ToolExec { + name: "shadow_judge".to_string(), + args_json: serde_json::json!({ + "verdict": "incomplete", + "confidence": verdict.confidence, + "reason": verdict.reason, + }) + .to_string(), + tool_call_id: "sj".to_string(), + }); + } + session.messages.push(Message::user(&msg)); + self.publish_session_state(&session).await; + continue 'conversation_loop; + } + // If incomplete but confidence is below threshold, or if + // verdict is complete — fall through to break normally. + } + // None = API error — non-fatal, fall through. + } + final_response = text; break; } @@ -2486,6 +2576,42 @@ fn tool_attempt_fingerprint(name: &str, args_json: &str) -> String { format!("{name}:{normalized_args}") } +fn invalid_args_missing_fields_suppression_key( + name: &str, + args_json: &str, + required_fields: &[String], +) -> Option { + let args = serde_json::from_str::(args_json).ok()?; + let obj = args.as_object()?; + + let mut missing: Vec = required_fields + .iter() + .filter(|field| !obj.contains_key(field.as_str())) + .cloned() + .collect(); + + if missing.is_empty() { + return None; + } + + missing.sort(); + Some(format!("invalid_args:{name}:missing:{}", missing.join(","))) +} + +fn invalid_args_semantic_key( + registry: &ToolRegistry, + name: &str, + args_json: &str, +) -> Option { + let required = registry.required_fields_for_tool(name)?; + invalid_args_missing_fields_suppression_key(name, args_json, &required) +} + +#[inline] +fn is_suppressed_argument_retry(payload: &ToolErrorResponse) -> bool { + payload.category == "arguments" && payload.code == "suppressed_repeated_tool_error" +} + fn 
suppressed_retry_response( name: &str, args_json: &str, @@ -2635,6 +2761,20 @@ fn build_completion_follow_up_message(outcome: &edgecrab_types::RunOutcome) -> S ) } +/// Build the user message injected when the Shadow Judge vetoes a "completed" verdict. +/// +/// The message is designed to be: +/// - Specific (uses the judge's `steering_hint` rather than a generic nudge). +/// - Non-repetitive with `build_completion_follow_up_message`. +/// - Clearly marked as a system injection so the model understands context. +fn build_shadow_judge_message(steering_hint: &str, reason: &str) -> String { + format!( + "[system: verification check indicates the task is not yet complete — {reason}. \ + {steering_hint} \ + Continue working and only stop once all parts of the original request are done with concrete evidence.]" + ) +} + fn summarize_tool_result_preview(name: &str, tool_result: &str, is_error: bool) -> Option { fn first_nonempty_line(text: &str) -> Option { text.lines() @@ -3845,6 +3985,7 @@ async fn process_response( // Partition tools into parallel-safe and sequential let mut parallel_tasks = tokio::task::JoinSet::new(); let mut sequential_calls = Vec::new(); + let mut argument_loop_blocked = false; // Track parallel tool call IDs/names so we can inject error results // for any task that panics — otherwise the assistant message has // tool_calls with no matching tool_results and the next API call fails. 
@@ -3975,6 +4116,11 @@ async fn process_response( error: extract_tool_error_text(&tool_result), tool_result: tool_result.clone(), }); + if let Some(payload) = parse_tool_error_response(&tool_result) + && is_suppressed_argument_retry(&payload) + { + argument_loop_blocked = true; + } failure_tracker.record_failure(&extract_tool_error_text(&tool_result)); } else { failure_tracker.record_success(); @@ -4078,6 +4224,11 @@ async fn process_response( error: extract_tool_error_text(&tool_result), tool_result: tool_result.clone(), }); + if let Some(payload) = parse_tool_error_response(&tool_result) + && is_suppressed_argument_retry(&payload) + { + argument_loop_blocked = true; + } failure_tracker.record_failure(&extract_tool_error_text(&tool_result)); } else { failure_tracker.record_success(); @@ -4092,6 +4243,12 @@ async fn process_response( session.messages.extend(injected_messages); } + if argument_loop_blocked { + session.messages.push(Message::user( + "Argument loop detected: do not retry the same malformed tool call. Read the tool error required_fields/usage_hint and either (1) provide all required JSON fields in the next tool call, or (2) ask the user for the missing value before any further tool calls.", + )); + } + // ── Consecutive failure escalation ─────────────────────────── // After all tools in this turn have run, check whether the // failure tracker has hit its threshold. 
If so, inject a system @@ -4333,13 +4490,23 @@ async fn dispatch_single_tool( }; let attempt_key = tool_attempt_fingerprint(name, args_json); - if let Some(prior) = dctx - .capability_suppressions - .lock() - .expect("capability suppression cache lock poisoned") - .get(&attempt_key) - .cloned() - { + let semantic_key = dctx + .registry + .as_ref() + .and_then(|reg| invalid_args_semantic_key(reg, name, args_json)); + + let prior = { + let guard = dctx + .capability_suppressions + .lock() + .expect("capability suppression cache lock poisoned"); + guard.get(&attempt_key).cloned().or_else(|| { + semantic_key + .as_ref() + .and_then(|key| guard.get(key).cloned()) + }) + }; + if let Some(prior) = prior { return ( serde_json::to_string(&suppressed_retry_response(name, args_json, &prior)) .expect("suppressed retry payload serializes"), @@ -4464,7 +4631,16 @@ async fn dispatch_single_tool( Err(ref e @ ToolError::InvalidArgs { .. }) => { // Enrich InvalidArgs with required_fields + usage_hint from schema. // This gives the LLM a precise corrective checklist on the next turn. 
- if let Some(enriched) = reg.enrich_invalid_args_error(name, e) { + if let Some(mut enriched) = reg.enrich_invalid_args_error(name, e) { + if enriched.suppression_key.is_none() + && let Some(ref required_fields) = enriched.required_fields + { + enriched.suppression_key = invalid_args_missing_fields_suppression_key( + name, + args_json, + required_fields, + ); + } serde_json::to_string(&enriched).expect("enriched error serializes") } else { e.to_llm_response() @@ -6968,6 +7144,46 @@ def register(ctx): assert!(second_payload.error.contains("same `write_file` call fail")); } + #[tokio::test] + async fn dispatch_single_tool_suppresses_semantic_invalid_argument_retry() { + let registry = Arc::new(ToolRegistry::new()); + let cancel = CancellationToken::new(); + let state_db = None; + let process_table = Arc::new(ProcessTable::new()); + let capability_suppressions = Arc::new(Mutex::new(HashMap::new())); + let dctx = make_dispatch_context_for_test( + ®istry, + &cancel, + &state_db, + &process_table, + capability_suppressions.clone(), + ); + + // write_file requires both path and content. Both calls below omit + // path, but differ in payload shape so exact-fingerprint matching alone + // would miss the loop. 
+ let first_args = r#"{"content":"first"}"#; + let second_args = r#"{"content":"second","if_exists":"overwrite"}"#; + + let (first, first_injected) = + dispatch_single_tool("call-write-semantic-1", "write_file", first_args, &dctx).await; + assert!(first_injected.is_empty()); + let first_payload = parse_tool_error_response(&first).expect("structured error"); + assert_eq!(first_payload.code, "invalid_arguments"); + remember_tool_suppression(&capability_suppressions, "write_file", first_args, &first); + + let (second, second_injected) = + dispatch_single_tool("call-write-semantic-2", "write_file", second_args, &dctx).await; + assert!(second_injected.is_empty()); + let second_payload = parse_tool_error_response(&second).expect("structured error"); + assert_eq!(second_payload.code, "suppressed_repeated_tool_error"); + assert_eq!(second_payload.category, "arguments"); + assert!( + second_payload.error.contains("same `write_file` call fail"), + "semantic suppression should block varied malformed retries" + ); + } + // ── Cancellation ───────────────────────────────────────────────────── #[tokio::test] @@ -7143,6 +7359,18 @@ def register(ctx): assert!(resp.usage_hint.is_some()); } + #[test] + fn invalid_args_missing_fields_key_detects_missing_required_fields() { + let required = vec!["path".to_string(), "content".to_string()]; + let key = invalid_args_missing_fields_suppression_key( + "write_file", + r#"{"content":"hello"}"#, + &required, + ) + .expect("missing path should generate a semantic suppression key"); + assert_eq!(key, "invalid_args:write_file:missing:path"); + } + // ── repair_tool_call_arguments tests ───────────────────────────── #[test] fn repair_empty_string() { diff --git a/crates/edgecrab-core/src/lib.rs b/crates/edgecrab-core/src/lib.rs index 3fdf72d..3d0ff00 100644 --- a/crates/edgecrab-core/src/lib.rs +++ b/crates/edgecrab-core/src/lib.rs @@ -19,6 +19,7 @@ pub mod model_discovery; pub mod model_router; pub mod pricing; pub mod prompt_builder; +pub mod 
shadow_judge; pub mod steering; pub mod sub_agent_runner; pub mod tool_result_spill; diff --git a/crates/edgecrab-core/src/shadow_judge.rs b/crates/edgecrab-core/src/shadow_judge.rs new file mode 100644 index 0000000..0270e1d --- /dev/null +++ b/crates/edgecrab-core/src/shadow_judge.rs @@ -0,0 +1,391 @@ +//! # Shadow Judge — Lightweight LLM Completion Oracle +//! +//! Fires AFTER the synchronous `DefaultCompletionPolicy` returns `Completed`. +//! Makes one isolated LLM classification call to verify that the original +//! user request is fully satisfied before the main loop breaks. +//! +//! ## Session isolation guarantee +//! +//! `run_shadow_judge()` borrows `messages` immutably and NEVER writes back. +//! The caller in `conversation.rs` is the ONLY place that may push a +//! steering-hint message — and only when `is_complete == false`. +//! +//! ## Prompt-cache behaviour +//! +//! No new `cache_control` markers are written for the judge call. +//! Existing markers on the cloned message slice — written by +//! `apply_cache_control` during the main loop — are preserved and will +//! produce cache HIT tokens on Anthropic, keeping per-call cost ≈ $0.003–0.005. + +use std::sync::Arc; + +use edgecrab_types::Message; +use edgequake_llm::LLMProvider; + +use crate::config::ShadowJudgeConfig; +use crate::conversation::build_chat_messages; + +// ─── System prompt ───────────────────────────────────────────────────────────── + +const SHADOW_JUDGE_SYSTEM_PROMPT: &str = "\ +You are a task-completion oracle. Your ONLY output is a JSON object — no prose outside the JSON. + +Output schema: +{\"verdict\":\"complete\"|\"incomplete\",\"confidence\":0.0-1.0,\"reason\":\"\",\"steering_hint\":\"\"} + +Strict rules: +- \"complete\" means EVERY part of the user's original request is DONE with concrete evidence in the conversation. +- If the agent announced a future action but has not yet executed it, output \"incomplete\". 
+- If any explicitly requested sub-task is missing evidence of completion, output \"incomplete\". +- When uncertain, prefer \"incomplete\". +- Output ONLY the JSON object. No markdown fences. No commentary."; + +/// Final user message appended to the judge's isolated message list. +/// Never added to the main `session.messages`. +const SHADOW_JUDGE_QUERY: &str = "\ +[shadow-judge query] +Review the entire conversation above. Has the agent's most recent response fully \ +completed the original user request? Check every sub-goal explicitly. If any sub-goal \ +was promised or implied but not yet evidenced with tool output or concrete content, \ +output \"incomplete\". Output the JSON verdict now."; + +// ─── Public types ─────────────────────────────────────────────────────────────── + +/// Structured verdict from the shadow judge. +#[derive(Debug, Clone)] +pub struct ShadowVerdict { + /// `true` if the judge considers the task fully complete. + pub is_complete: bool, + /// Judge confidence in the verdict. Range `[0.0, 1.0]`. + pub confidence: f32, + /// One-sentence reason for the verdict. + pub reason: String, + /// Specific next action the agent should take, if incomplete. + pub steering_hint: Option, + /// Input tokens consumed by this judge call (for session cost accounting). + pub input_tokens: u32, + /// Output tokens consumed by this judge call. + pub output_tokens: u32, +} + +// ─── Public API ──────────────────────────────────────────────────────────────── + +/// Run a single shadow judge classification call. +/// +/// Returns `None` on API failure or JSON parse failure — both are non-fatal. +/// The caller falls back to the synchronous assessor's verdict. +/// +/// Returns `Some(ShadowVerdict)` on success. The caller checks +/// `verdict.is_complete` and `verdict.confidence` before acting. +/// +/// # Session Isolation +/// +/// This function takes a shared reference to `messages` and NEVER mutates it. 
+/// It constructs its own ephemeral `Vec` for the judge call and +/// discards that list on return. +pub async fn run_shadow_judge( + provider: &Arc, + model: &str, + messages: &[Message], + config: &ShadowJudgeConfig, +) -> Option { + // Minimum session length guard — skip trivial Q&A sessions. + if messages.len() < config.min_messages_before_enable { + tracing::debug!( + msg_count = messages.len(), + min = config.min_messages_before_enable, + "shadow judge: skipping — session too short" + ); + return None; + } + + // Trim to the most-recent `context_messages` to bound token cost. + // For very large sessions the tail is sufficient; older turns are already + // cached so we lose little context but save significant prompt tokens. + let context_slice = if config.context_messages > 0 && messages.len() > config.context_messages { + &messages[messages.len() - config.context_messages..] + } else { + messages + }; + + // Build the judge's isolated message list. + // - `SHADOW_JUDGE_SYSTEM_PROMPT`: judge's own identity (not session prompt). + // - `cache_config = None`: do NOT write new cache_control markers; existing + // markers on the slice produce server-side cache HITs for free. + let mut chat_messages = + build_chat_messages(Some(SHADOW_JUDGE_SYSTEM_PROMPT), context_slice, None); + + // Append the judge query as the final user message. + // This message is NEVER propagated to session.messages. + chat_messages.push(edgequake_llm::ChatMessage::user(SHADOW_JUDGE_QUERY)); + + // One-shot call with an empty tool list and no streaming. + // The LLMProvider default for `chat_with_tools` with an empty slice is + // equivalent to a plain `chat` call. 
+ let response = match provider + .chat_with_tools(&chat_messages, &[], None, None) + .await + { + Ok(r) => r, + Err(e) => { + tracing::warn!( + error = %e, + model = model, + "shadow judge: API call failed (non-fatal, falling through to loop break)" + ); + return None; + } + }; + + let raw_text = response.content.trim().to_string(); + let input_tokens = response.prompt_tokens as u32; + let output_tokens = response.completion_tokens as u32; + + tracing::debug!( + raw = %raw_text, + input_tokens, + output_tokens, + "shadow judge: raw response" + ); + + parse_shadow_verdict(&raw_text, input_tokens, output_tokens) +} + +/// Resolve the `(provider, model_string)` pair to use for the shadow judge. +/// +/// Priority: +/// 1. `shadow_judge.model` (explicit override) +/// 2. `auxiliary_model` passed by caller (from `AgentConfig.auxiliary.model`) +/// 3. Fallback: `(main_provider.clone(), main_model.to_string())` +/// +/// When the chosen model string contains `/`, the prefix is treated as the +/// provider family and a new provider is created via +/// `edgecrab_tools::create_provider_for_model`. On failure, falls back to +/// the main provider with the raw model string. 
+pub fn resolve_shadow_provider_and_model( + shadow_cfg: &ShadowJudgeConfig, + auxiliary_model: Option<&str>, + main_provider: Arc, + main_model: &str, +) -> (Arc, String) { + let candidate = shadow_cfg + .model + .as_deref() + .or(auxiliary_model) + .map(str::trim) + .filter(|s| !s.is_empty()); + + let Some(raw_model) = candidate else { + return (main_provider, main_model.to_string()); + }; + + if let Some((provider_name, model_name)) = raw_model.split_once('/') { + let canonical = edgecrab_tools::vision_models::normalize_provider_name(provider_name); + match edgecrab_tools::create_provider_for_model(&canonical, model_name) { + Ok(p) => return (p, raw_model.to_string()), + Err(e) => { + tracing::warn!( + error = %e, + raw_model, + "shadow judge: failed to create configured provider, using main provider" + ); + } + } + } + + // Bare model name — reuse main provider credentials. + (main_provider, raw_model.to_string()) +} + +// ─── Private helpers ───────────────────────────────────────────────────────── + +/// Parse the judge's JSON verdict, tolerating markdown fences and leading prose. +/// +/// Returns `None` if JSON is malformed or `verdict` field is missing. +fn parse_shadow_verdict( + text: &str, + input_tokens: u32, + output_tokens: u32, +) -> Option { + // Strip markdown code fences that some models emit despite the prompt. + let stripped = text + .trim() + .trim_start_matches("```json") + .trim_start_matches("```") + .trim_end_matches("```") + .trim(); + + // Find JSON object boundaries in case there is surrounding prose. 
+ let start = stripped.find('{')?; + let end = stripped.rfind('}').map(|i| i + 1)?; + let json_slice = stripped.get(start..end)?; + + let v: serde_json::Value = serde_json::from_str(json_slice).ok()?; + + let verdict_str = v["verdict"].as_str()?; + let is_complete = verdict_str == "complete"; + let confidence = v["confidence"].as_f64().unwrap_or(0.5) as f32; + let reason = v["reason"] + .as_str() + .unwrap_or("no reason provided") + .to_string(); + + // Treat JSON `null`, the string "null", and empty string all as None. + let steering_hint = v["steering_hint"] + .as_str() + .filter(|s| !s.is_empty() && *s != "null") + .map(str::to_string); + + Some(ShadowVerdict { + is_complete, + confidence, + reason, + steering_hint, + input_tokens, + output_tokens, + }) +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_complete_verdict_ok() { + let json = r#"{"verdict":"complete","confidence":0.95,"reason":"All files created.","steering_hint":null}"#; + let v = + parse_shadow_verdict(json, 100, 20).expect("expected valid complete shadow verdict"); + assert!(v.is_complete); + assert!((v.confidence - 0.95).abs() < 0.01); + assert_eq!(v.reason, "All files created."); + assert!(v.steering_hint.is_none()); + assert_eq!(v.input_tokens, 100); + assert_eq!(v.output_tokens, 20); + } + + #[test] + fn parse_incomplete_verdict_with_hint() { + let json = r#"{"verdict":"incomplete","confidence":0.88,"reason":"CSS file missing.","steering_hint":"Create style.css with the game styles."}"#; + let v = parse_shadow_verdict(json, 200, 30) + .expect("expected valid incomplete shadow verdict with hint"); + assert!(!v.is_complete); + assert!((v.confidence - 0.88).abs() < 0.01); + assert!(v.steering_hint.is_some()); + assert!( + v.steering_hint + .as_deref() + .is_some_and(|hint| hint.contains("style.css")) + ); + } + + #[test] + fn parse_strips_markdown_fences() { + let json =
"```json\n{\"verdict\":\"complete\",\"confidence\":0.9,\"reason\":\"done\",\"steering_hint\":null}\n```"; + let v = parse_shadow_verdict(json, 10, 5).expect("expected fenced JSON shadow verdict"); + assert!(v.is_complete); + } + + #[test] + fn parse_json_fence_no_lang_tag() { + let json = "```\n{\"verdict\":\"incomplete\",\"confidence\":0.7,\"reason\":\"not done\",\"steering_hint\":\"keep going\"}\n```"; + let v = parse_shadow_verdict(json, 0, 0) + .expect("expected fenced JSON shadow verdict without language tag"); + assert!(!v.is_complete); + } + + #[test] + fn parse_invalid_json_returns_none() { + assert!(parse_shadow_verdict("This is not JSON at all.", 0, 0).is_none()); + } + + #[test] + fn parse_missing_verdict_field_returns_none() { + let json = r#"{"confidence":0.9,"reason":"done","steering_hint":null}"#; + assert!(parse_shadow_verdict(json, 0, 0).is_none()); + } + + #[test] + fn parse_json_with_leading_prose() { + // Some models prepend a sentence despite the system prompt. + let json = r#"Here is my verdict: {"verdict":"incomplete","confidence":0.75,"reason":"JS missing.","steering_hint":"Write game.js."}"#; + let v = parse_shadow_verdict(json, 0, 0) + .expect("expected parser to recover JSON shadow verdict after prose prefix"); + assert!(!v.is_complete); + assert!(v.steering_hint.is_some()); + } + + #[test] + fn parse_null_string_steering_hint_becomes_none() { + let json = r#"{"verdict":"complete","confidence":0.99,"reason":"All done.","steering_hint":"null"}"#; + let v = parse_shadow_verdict(json, 0, 0) + .expect("expected valid shadow verdict with string null steering hint"); + assert!(v.steering_hint.is_none()); + } + + #[test] + fn parse_empty_string_steering_hint_becomes_none() { + let json = + r#"{"verdict":"complete","confidence":0.99,"reason":"All done.","steering_hint":""}"#; + let v = parse_shadow_verdict(json, 0, 0) + .expect("expected valid shadow verdict with empty steering hint"); + assert!(v.steering_hint.is_none()); + } + + #[test] + fn 
default_shadow_judge_config_is_disabled() { + let cfg = ShadowJudgeConfig::default(); + assert!(!cfg.enabled); + assert_eq!(cfg.max_per_session, 5); + assert!((cfg.confidence_threshold - 0.70).abs() < 0.001); + assert_eq!(cfg.context_messages, 20); + assert_eq!(cfg.min_messages_before_enable, 4); + } + + #[test] + fn resolve_no_override_returns_main_model() { + // When no shadow model and no auxiliary model are configured, + // resolve must return the main_model string unchanged and the + // same provider pointer. + use edgequake_llm::MockProvider; + use std::sync::Arc; + + let cfg = ShadowJudgeConfig::default(); + assert!(cfg.model.is_none()); + + let mock: Arc = Arc::new(MockProvider::new()); + let (returned_provider, returned_model) = resolve_shadow_provider_and_model( + &cfg, + None, + mock.clone(), + "anthropic/claude-sonnet-4", + ); + + assert_eq!(returned_model, "anthropic/claude-sonnet-4"); + assert!(Arc::ptr_eq(&returned_provider, &mock)); + } + + #[test] + fn resolve_auxiliary_model_overrides_when_no_shadow_model() { + // When `shadow_cfg.model` is None but `auxiliary_model` is provided, + // the auxiliary model string should be used as the fallback candidate. + use edgequake_llm::MockProvider; + use std::sync::Arc; + + let cfg = ShadowJudgeConfig::default(); // model: None + let mock: Arc = Arc::new(MockProvider::new()); + // "bare-model-name" has no '/' so resolve falls through to the + // (main_provider, raw_model) branch rather than creating a new provider. + let (returned_provider, returned_model) = resolve_shadow_provider_and_model( + &cfg, + Some("bare-model-name"), + mock.clone(), + "main/model", + ); + + assert_eq!(returned_model, "bare-model-name"); + // Provider should still be the main one (no '/' means no new provider is created). 
+ assert!(Arc::ptr_eq(&returned_provider, &mock)); + } +} diff --git a/crates/edgecrab-tools/src/registry.rs b/crates/edgecrab-tools/src/registry.rs index 8e5c0d4..e525ff5 100644 --- a/crates/edgecrab-tools/src/registry.rs +++ b/crates/edgecrab-tools/src/registry.rs @@ -904,6 +904,37 @@ impl ToolRegistry { }) } + /// Required argument names declared in the tool schema. + /// + /// Returns `None` when the tool is unknown. + pub fn required_fields_for_tool(&self, name: &str) -> Option> { + let static_name = self.tool_aliases.get(name).copied().unwrap_or(name); + let schema = if let Some(handler) = self.tools.get(static_name) { + handler.schema() + } else { + let dynamic_name = self + .dynamic_tool_aliases + .get(name) + .map(String::as_str) + .unwrap_or(name); + let handler = self.dynamic_tools.get(dynamic_name)?; + handler.schema() + }; + + Some( + schema + .parameters + .get("required") + .and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .filter_map(|v| v.as_str().map(String::from)) + .collect::>() + }) + .unwrap_or_default(), + ) + } + /// Summary of toolsets with tool counts. 
pub fn toolset_summary(&self) -> Vec<(String, usize)> { let mut counts: std::collections::BTreeMap = self @@ -1835,6 +1866,17 @@ mod tests { assert!(!result, "bad json should not be parallelizable"); } + #[test] + fn required_fields_for_write_file_reflect_schema() { + let reg = ToolRegistry::new(); + let required = reg + .required_fields_for_tool("write_file") + .expect("write_file should be registered"); + assert!(required.contains(&"path".to_string())); + assert!(required.contains(&"content".to_string())); + assert!(!required.contains(&"create_dirs".to_string())); + } + // ── Path-prefix overlap tests (FP9 hardening) ───────────────── #[test] diff --git a/crates/edgecrab-tools/src/tools/delegate_task.rs b/crates/edgecrab-tools/src/tools/delegate_task.rs index 0aeae38..ccf1132 100644 --- a/crates/edgecrab-tools/src/tools/delegate_task.rs +++ b/crates/edgecrab-tools/src/tools/delegate_task.rs @@ -363,7 +363,9 @@ impl ToolHandler for DelegateTaskToolReal { Each child gets its own execute_loop with tools and independent context. \ Pass all relevant paths, errors, and constraints via `context` because \ child agents do not inherit your full conversation. Use this for \ - reasoning-heavy or parallel subtasks, not for single direct tool calls." + reasoning-heavy or parallel subtasks, not for single direct tool calls. \ + Required: provide either `goal` (single task) or `tasks` (batch) — \ + calling without either returns an error." .into(), parameters: json!({ "type": "object", diff --git a/crates/edgecrab-tools/src/tools/execute_code.rs b/crates/edgecrab-tools/src/tools/execute_code.rs index 33b5fb9..a682509 100644 --- a/crates/edgecrab-tools/src/tools/execute_code.rs +++ b/crates/edgecrab-tools/src/tools/execute_code.rs @@ -1628,7 +1628,7 @@ impl ToolHandler for ExecuteCodeToolReal { "properties": { "code": { "type": "string", - "description": "Python code to execute. 
Import tools with `from edgecrab_tools import web_search, terminal, ...` and print your final result to stdout." + "description": "Python code to execute. Must be non-empty — only call when you already have a concrete code payload to run. Import tools with `from edgecrab_tools import web_search, terminal, ...` and print your final result to stdout." }, "language": { "type": "string", diff --git a/crates/edgecrab-tools/src/tools/file_patch.rs b/crates/edgecrab-tools/src/tools/file_patch.rs index d5e637d..577b8eb 100644 --- a/crates/edgecrab-tools/src/tools/file_patch.rs +++ b/crates/edgecrab-tools/src/tools/file_patch.rs @@ -200,11 +200,16 @@ async fn execute_replace_patch(args: ReplaceArgs, ctx: &ToolContext) -> Result, offset: usize, limit: usize) -> String { +/// Returns `(text, total_before_limit, returned_count)`. +/// +/// `total_before_limit` is the number of raw matches collected before paging. +/// `returned_count` is the number of items actually present in `text` (after +/// applying `offset` + `limit`). The caller uses these to build a machine- +/// readable `[search_result ...]` summary that lets the agent decide whether +/// to paginate rather than blindly re-running the same query. +fn format_file_results( + matches: Vec, + offset: usize, + limit: usize, +) -> (String, usize, usize) { + let total = matches.len(); let page: Vec = matches.into_iter().skip(offset).take(limit).collect(); - if page.is_empty() { + let returned = page.len(); + let text = if page.is_empty() { "No matches found.".to_string() } else { page.join("\n") - } + }; + (text, total, returned) } +/// Returns `(text, total_before_limit, returned_count)`. +/// +/// `total_before_limit` is the raw match count before paging. For +/// `files_only` and `count` modes this is the raw hit count (not the +/// deduplicated file count) — useful as a relative signal; the agent cares +/// mainly about `has_more`, which is computed from total vs offset+returned. 
+/// `returned_count` is the number of entries in the output text. fn format_content_results( matches: Vec<(String, usize, String, usize)>, output_mode: &str, offset: usize, limit: usize, -) -> String { +) -> (String, usize, usize) { if matches.is_empty() { - return "No matches found.".to_string(); + return ("No matches found.".to_string(), 0, 0); } + let total = matches.len(); + match output_mode { "files_only" => { let mut files = Vec::::new(); @@ -379,11 +419,13 @@ fn format_content_results( } } let page: Vec = files.into_iter().skip(offset).take(limit).collect(); - if page.is_empty() { + let returned = page.len(); + let text = if page.is_empty() { "No matches found.".to_string() } else { page.join("\n") - } + }; + (text, total, returned) } "count" => { let mut counts = std::collections::BTreeMap::::new(); @@ -396,11 +438,13 @@ fn format_content_results( .take(limit) .map(|(path, count)| format!("{path}: {count}")) .collect(); - if page.is_empty() { + let returned = page.len(); + let text = if page.is_empty() { "No matches found.".to_string() } else { page.join("\n") - } + }; + (text, total, returned) } _ => { let page: Vec = matches @@ -415,11 +459,13 @@ fn format_content_results( } }) .collect(); - if page.is_empty() { + let returned = page.len(); + let text = if page.is_empty() { "No matches found.".to_string() } else { page.join("\n") - } + }; + (text, total, returned) } } } @@ -549,4 +595,110 @@ mod tests { assert!(simple_glob_match("Makefile", "Makefile")); assert!(!simple_glob_match("Makefile", "makefile")); } + + // ── Pagination summary header tests ────────────────────────────────────── + // These tests verify the machine-readable [search_result ...] header that + // the agent uses to decide whether to paginate or stop searching. 
+ + #[tokio::test] + async fn search_result_header_present_on_match() { + let dir = TempDir::new().expect("tmpdir"); + std::fs::write(dir.path().join("a.rs"), "needle\n").expect("w"); + + let ctx = ctx_in(dir.path()); + let result = SearchFilesTool + .execute(json!({"pattern": "needle"}), &ctx) + .await + .expect("search"); + + assert!( + result.starts_with("[search_result returned="), + "must start with machine-readable summary header; got: {result}" + ); + assert!( + result.contains("has_more=false"), + "single-page result must report has_more=false; got: {result}" + ); + } + + #[tokio::test] + async fn search_result_header_no_matches() { + let dir = TempDir::new().expect("tmpdir"); + std::fs::write(dir.path().join("test.txt"), "nothing here").expect("w"); + + let ctx = ctx_in(dir.path()); + let result = SearchFilesTool + .execute(json!({"pattern": "zzzzz_unique_zzzzz"}), &ctx) + .await + .expect("search"); + + assert!( + result.starts_with("[search_result returned=0 total=0 has_more=false]"), + "zero-match result must report returned=0 total=0; got: {result}" + ); + } + + #[tokio::test] + async fn search_result_header_has_more_with_offset() { + let dir = TempDir::new().expect("tmpdir"); + // Create 5 files that each match "needle"; limit to 2 per page. + for i in 0..5 { + std::fs::write(dir.path().join(format!("f{i}.rs")), "needle_unique_abc\n").expect("w"); + } + + let ctx = ctx_in(dir.path()); + // First page: offset=0, limit=2. + let result = SearchFilesTool + .execute( + json!({"pattern": "needle_unique_abc", "limit": 2, "offset": 0}), + &ctx, + ) + .await + .expect("search page1"); + + assert!( + result.contains("has_more=true"), + "first page of 5 matches with limit=2 must report has_more=true; got: {result}" + ); + assert!( + result.contains("next_offset=2"), + "must report next_offset=2; got: {result}" + ); + + // Second page: offset=2, limit=2 (should still have more). 
+ let result2 = SearchFilesTool + .execute( + json!({"pattern": "needle_unique_abc", "limit": 2, "offset": 2}), + &ctx, + ) + .await + .expect("search page2"); + + assert!( + result2.contains("has_more=true"), + "second page must still report has_more=true; got: {result2}" + ); + assert!( + result2.contains("next_offset=4"), + "must report next_offset=4; got: {result2}" + ); + + // Last page: offset=4, limit=2 — only 1 item left. + let result3 = SearchFilesTool + .execute( + json!({"pattern": "needle_unique_abc", "limit": 2, "offset": 4}), + &ctx, + ) + .await + .expect("search page3"); + + assert!( + result3.contains("has_more=false"), + "last page must report has_more=false; got: {result3}" + ); + assert!( + !result3.contains("next_offset"), + "no next_offset on last page; got: {result3}" + ); + } } diff --git a/crates/edgecrab-tools/src/tools/file_write.rs b/crates/edgecrab-tools/src/tools/file_write.rs index a054036..506af35 100644 --- a/crates/edgecrab-tools/src/tools/file_write.rs +++ b/crates/edgecrab-tools/src/tools/file_write.rs @@ -101,7 +101,7 @@ impl ToolHandler for WriteFileTool { "description": "Intent when the file already exists. 'overwrite' (default): rejection on collision returns a content preview and records a session snapshot — retry the SAME call to succeed. 'abort': cheap rejection with no preview and no snapshot — use when intent is to create a NEW file and a different path should be chosen on collision." } }, - "required": ["path", "content", "create_dirs"] + "required": ["path", "content"] }), strict: Some(true), } @@ -270,11 +270,17 @@ impl ToolHandler for WriteFileTool { )) } else { // R18: Structured JSON result (FP57). + // `lines` gives the agent a measurement signal without a re-read: + // it can verify the write produced the expected number of lines and + // decide next action (patch a specific line, read a range, etc.) + // without a redundant read_file round-trip. 
let action = if file_exists { "overwrite" } else { "create" }; + let lines = content.lines().count(); Ok(serde_json::json!({ "ok": true, "action": action, "bytes": bytes_written, + "lines": lines, "path": args.path, }) .to_string()) @@ -310,6 +316,8 @@ mod tests { assert_eq!(v["ok"], true); assert_eq!(v["bytes"], 11); assert_eq!(v["action"], "create"); + // "hello world" has no newlines → 1 line + assert_eq!(v["lines"], 1, "lines must reflect actual line count"); let content = std::fs::read_to_string(dir.path().join("new.txt")).expect("read"); assert_eq!(content, "hello world"); } @@ -338,10 +346,7 @@ mod tests { assert_eq!(schema.strict, Some(true)); assert_eq!(schema.parameters["type"], "object"); assert_eq!(schema.parameters["additionalProperties"], false); - assert_eq!( - schema.parameters["required"], - json!(["path", "content", "create_dirs"]) - ); + assert_eq!(schema.parameters["required"], json!(["path", "content"])); // content must be "string" — NOT ["string", "null"] assert_eq!( schema.parameters["properties"]["content"]["type"], diff --git a/crates/edgecrab-tools/src/tools/memory.rs b/crates/edgecrab-tools/src/tools/memory.rs index 162efa9..2706a9f 100644 --- a/crates/edgecrab-tools/src/tools/memory.rs +++ b/crates/edgecrab-tools/src/tools/memory.rs @@ -169,7 +169,12 @@ impl ToolHandler for MemoryWriteTool { description: "Manage the agent's persistent memory. Actions: 'add' appends a new \ entry, 'replace' swaps old_content with content, 'remove' deletes \ the entry matching old_content. Hermes-compatible calls using \ - `memory` and `old_text` are also accepted." + `memory` and `old_text` are also accepted. \ + Required fields per action: \ + 'add': content must be non-empty; \ + 'replace': content (new text) AND old_content (text to find) both required; \ + 'remove': old_content (text to find) required. \ + Calling with no arguments returns the current memory contents." 
.into(), parameters: json!({ "type": "object", @@ -181,11 +186,11 @@ impl ToolHandler for MemoryWriteTool { }, "content": { "type": "string", - "description": "Memory entry to add, or new content for replace" + "description": "Memory entry to add, or new content for replace. Required for 'add' and 'replace' actions." }, "old_content": { "type": "string", - "description": "Substring to match for replace/remove actions" + "description": "Substring to match for replace/remove actions. Required for 'replace' and 'remove' actions." }, "old_text": { "type": "string", @@ -194,9 +199,10 @@ impl ToolHandler for MemoryWriteTool { "target": { "type": "string", "enum": ["memory", "user"], - "description": "Which memory file to write to" + "description": "Which memory file to write to (default: memory)" } - } + }, + "required": [] }), strict: None, } diff --git a/crates/edgecrab-tools/src/tools/terminal.rs b/crates/edgecrab-tools/src/tools/terminal.rs index c81a165..79aebf7 100644 --- a/crates/edgecrab-tools/src/tools/terminal.rs +++ b/crates/edgecrab-tools/src/tools/terminal.rs @@ -175,7 +175,7 @@ impl ToolHandler for TerminalTool { }, "timeout": { "type": "integer", - "description": "Timeout alias in seconds." + "description": "Alias for `timeout_seconds`. Takes precedence over `timeout_seconds` when both are set (default: 120, max: 600)." }, "pty": { "type": "boolean", @@ -356,6 +356,12 @@ impl ToolHandler for TerminalTool { // Format output (includes stdout/stderr/exit-code) let max_stdout = ctx.config.max_terminal_output; let max_stderr = ctx.config.max_terminal_output / 4; + // Compute truncation BEFORE format() discards the raw lengths. + // This gives the agent an exact, deterministic signal rather than + // requiring it to guess from "... [N bytes omitted] ..." prose. 
+ let was_truncated = + exec_output.stdout.len() > max_stdout || exec_output.stderr.len() > max_stderr; + let total_output_chars = exec_output.stdout.len() + exec_output.stderr.len(); let mut result = exec_output.format(max_stdout, max_stderr); // Strip ANSI escape codes for clean LLM consumption. @@ -367,6 +373,21 @@ impl ToolHandler for TerminalTool { let header = terminal_result_header(&ctx.config.terminal_backend, &cwd, exec_output.exit_code); + // Augment the header with truncation info when output was cut off. + // The agent needs this signal BEFORE reading the (truncated) body so it + // knows to narrow the command (grep, head, tail) rather than blindly + // re-running and getting the same truncated result. + // Implementation: strip the closing `]` and append new key=value pairs. + // The header is always ASCII, so byte-level slicing is safe here. + let header = if was_truncated { + format!( + "{} truncated=true output_chars={}]", + &header[..header.len() - 1], + total_output_chars + ) + } else { + header + }; result = if result.is_empty() { header } else { diff --git a/crates/edgecrab-tools/src/tools/web.rs b/crates/edgecrab-tools/src/tools/web.rs index b991b00..dd00cb9 100644 --- a/crates/edgecrab-tools/src/tools/web.rs +++ b/crates/edgecrab-tools/src/tools/web.rs @@ -1600,7 +1600,7 @@ impl ToolHandler for WebSearchTool { }, "max_results": { "type": "integer", - "description": "Maximum results to return (default: 5)" + "description": "Maximum results to return (default: 5, max: 20)" }, "backend": { "type": "string", @@ -1893,7 +1893,7 @@ impl ToolHandler for WebExtractTool { fn schema(&self) -> ToolSchema { ToolSchema { name: "web_extract".into(), - description: "Extract readable content from one or more URLs. Accepts EdgeCrab's single `url` form and `urls` arrays (up to 5 URLs). 
Returns structured JSON with content, metadata, backend selection, PDF extraction via EdgeParse, and browser-rendered fallback for JS-heavy pages.".into(), + description: "Extract readable content from one or more URLs. Accepts EdgeCrab's single `url` form and `urls` arrays (up to 5 URLs). Returns structured JSON with content, metadata, backend selection, PDF extraction via EdgeParse, and browser-rendered fallback for JS-heavy pages. Either `url` (single) or `urls` (batch) must be provided — calling without either returns an error.".into(), parameters: json!({ "type": "object", "properties": { @@ -1919,7 +1919,8 @@ impl ToolHandler for WebExtractTool { "type": "boolean", "description": "When true (default), try a browser-rendered fallback for JS-heavy pages when native extraction is too thin" } - } + }, + "required": [] }), strict: None, } diff --git a/sdks/node/package-lock.json b/sdks/node/package-lock.json index 76077d3..ea30283 100644 --- a/sdks/node/package-lock.json +++ b/sdks/node/package-lock.json @@ -1,12 +1,12 @@ { "name": "edgecrab-sdk", - "version": "0.8.0", + "version": "0.9.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "edgecrab-sdk", - "version": "0.8.0", + "version": "0.9.0", "license": "MIT", "bin": { "edgecrab": "dist/cli.mjs" diff --git a/sdks/node/package.json b/sdks/node/package.json index 49ee706..a5a7aad 100644 --- a/sdks/node/package.json +++ b/sdks/node/package.json @@ -1,6 +1,6 @@ { "name": "edgecrab-sdk", - "version": "0.8.0", + "version": "0.9.0", "description": "Node.js SDK for EdgeCrab — a Rust-native autonomous coding agent", "main": "./dist/index.js", "module": "./dist/index.mjs", diff --git a/sdks/nodejs-native/index.js b/sdks/nodejs-native/index.js index 312f27a..dacafb2 100644 --- a/sdks/nodejs-native/index.js +++ b/sdks/nodejs-native/index.js @@ -77,8 +77,8 @@ function requireNative() { try { const binding = require('edgecrab-android-arm64') const bindingPackageVersion = 
require('edgecrab-android-arm64/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -93,8 +93,8 @@ function requireNative() { try { const binding = require('edgecrab-android-arm-eabi') const bindingPackageVersion = require('edgecrab-android-arm-eabi/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -114,8 +114,8 @@ function requireNative() { try { const binding = require('edgecrab-win32-x64-gnu') const bindingPackageVersion = require('edgecrab-win32-x64-gnu/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -130,8 +130,8 @@ function requireNative() { try { const binding = require('edgecrab-win32-x64-msvc') const bindingPackageVersion = require('edgecrab-win32-x64-msvc/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -147,8 +147,8 @@ function requireNative() { try { const binding = require('edgecrab-win32-ia32-msvc') const bindingPackageVersion = require('edgecrab-win32-ia32-msvc/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -163,8 +163,8 @@ function requireNative() { try { const binding = require('edgecrab-win32-arm64-msvc') const bindingPackageVersion = require('edgecrab-win32-arm64-msvc/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -182,8 +182,8 @@ function requireNative() { try { const binding = require('edgecrab-darwin-universal') const bindingPackageVersion = require('edgecrab-darwin-universal/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -198,8 +198,8 @@ function requireNative() { try { const binding = require('edgecrab-darwin-x64') const bindingPackageVersion = require('edgecrab-darwin-x64/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -214,8 +214,8 @@ function requireNative() { try { const binding = require('edgecrab-darwin-arm64') const bindingPackageVersion = require('edgecrab-darwin-arm64/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -234,8 +234,8 @@ function requireNative() { try { const binding = require('edgecrab-freebsd-x64') const bindingPackageVersion = require('edgecrab-freebsd-x64/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -250,8 +250,8 @@ function requireNative() { try { const binding = require('edgecrab-freebsd-arm64') const bindingPackageVersion = require('edgecrab-freebsd-arm64/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -271,8 +271,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-x64-musl') const bindingPackageVersion = require('edgecrab-linux-x64-musl/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -287,8 +287,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-x64-gnu') const bindingPackageVersion = require('edgecrab-linux-x64-gnu/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -305,8 +305,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-arm64-musl') const bindingPackageVersion = require('edgecrab-linux-arm64-musl/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -321,8 +321,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-arm64-gnu') const bindingPackageVersion = require('edgecrab-linux-arm64-gnu/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -339,8 +339,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-arm-musleabihf') const bindingPackageVersion = require('edgecrab-linux-arm-musleabihf/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -355,8 +355,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-arm-gnueabihf') const bindingPackageVersion = require('edgecrab-linux-arm-gnueabihf/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -373,8 +373,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-loong64-musl') const bindingPackageVersion = require('edgecrab-linux-loong64-musl/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -389,8 +389,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-loong64-gnu') const bindingPackageVersion = require('edgecrab-linux-loong64-gnu/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -407,8 +407,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-riscv64-musl') const bindingPackageVersion = require('edgecrab-linux-riscv64-musl/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -423,8 +423,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-riscv64-gnu') const bindingPackageVersion = require('edgecrab-linux-riscv64-gnu/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -440,8 +440,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-ppc64-gnu') const bindingPackageVersion = require('edgecrab-linux-ppc64-gnu/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -456,8 +456,8 @@ function requireNative() { try { const binding = require('edgecrab-linux-s390x-gnu') const bindingPackageVersion = require('edgecrab-linux-s390x-gnu/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -476,8 +476,8 @@ function requireNative() { try { const binding = require('edgecrab-openharmony-arm64') const bindingPackageVersion = require('edgecrab-openharmony-arm64/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -492,8 +492,8 @@ function requireNative() { try { const binding = require('edgecrab-openharmony-x64') const bindingPackageVersion = require('edgecrab-openharmony-x64/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. 
You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -508,8 +508,8 @@ function requireNative() { try { const binding = require('edgecrab-openharmony-arm') const bindingPackageVersion = require('edgecrab-openharmony-arm/package.json').version - if (bindingPackageVersion !== '0.8.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.8.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.9.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.9.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { diff --git a/sdks/nodejs-native/package-lock.json b/sdks/nodejs-native/package-lock.json index 2ac6229..5bb6286 100644 --- a/sdks/nodejs-native/package-lock.json +++ b/sdks/nodejs-native/package-lock.json @@ -1,12 +1,12 @@ { "name": "edgecrab", - "version": "0.8.0", + "version": "0.9.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "edgecrab", - "version": "0.8.0", + "version": "0.9.0", "license": "Apache-2.0", "devDependencies": { "@napi-rs/cli": "^3.0.0", diff --git a/sdks/nodejs-native/package.json b/sdks/nodejs-native/package.json index 6623f73..ff8ab0e 100644 --- a/sdks/nodejs-native/package.json +++ b/sdks/nodejs-native/package.json @@ -1,6 +1,6 @@ { "name": "edgecrab", - "version": "0.8.0", + "version": "0.9.0", "description": "Native Node.js SDK for the EdgeCrab AI agent runtime", "main": "index.js", "types": "index.d.ts", diff --git a/sdks/npm-cli/package.json b/sdks/npm-cli/package.json index 447af06..f95d26f 100644 --- a/sdks/npm-cli/package.json +++ b/sdks/npm-cli/package.json @@ -1,6 +1,6 @@ { "name": "edgecrab-cli", - "version": 
"0.8.0", + "version": "0.9.0", "description": "EdgeCrab — Super Powerful Personal Assistant inspired by NousHermes and OpenClaw. Rust-native TUI, ReAct tool loop, multi-provider LLM.", "license": "MIT", "homepage": "https://www.edgecrab.com", diff --git a/sdks/pypi-cli/edgecrab_cli/_version.py b/sdks/pypi-cli/edgecrab_cli/_version.py index 777f190..3e2f46a 100644 --- a/sdks/pypi-cli/edgecrab_cli/_version.py +++ b/sdks/pypi-cli/edgecrab_cli/_version.py @@ -1 +1 @@ -__version__ = "0.8.0" +__version__ = "0.9.0" diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 1a88b70..2985834 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "edgecrab" -version = "0.8.0" +version = "0.9.0" description = "EdgeCrab AI Agent SDK — build autonomous agents with Python" readme = "README.md" license = { text = "Apache-2.0" } diff --git a/sdks/wasm/package.json b/sdks/wasm/package.json index 363b3c9..2d0037b 100644 --- a/sdks/wasm/package.json +++ b/sdks/wasm/package.json @@ -1,6 +1,6 @@ { "name": "@edgecrab/wasm", - "version": "0.8.0", + "version": "0.9.0", "description": "WASM bindings for the EdgeCrab AI agent SDK — browser & edge runtime", "license": "MIT OR Apache-2.0", "repository": { diff --git a/site/src/content/docs/changelog.md b/site/src/content/docs/changelog.md index ff5f88c..d1b70cb 100644 --- a/site/src/content/docs/changelog.md +++ b/site/src/content/docs/changelog.md @@ -8,12 +8,36 @@ sidebar: All notable changes to EdgeCrab are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -Last updated: 2026-04-21 +Last updated: 2026-04-22 --- ## [Unreleased] +## [0.9.0] — 2026-04-22 + +### Added + +- **Shadow Judge completion oracle** — EdgeCrab can now run an opt-in secondary LLM verdict before accepting a run as complete, then inject a continuation hint when a model appears to stop one step too early. 
+- **`/shadow-judge` TUI control surface** — session-scoped on/off/toggle/status command plus a picker UI and status-bar intervention badge. +- **Structured `report_task_status` tool** — the model can now emit milestone and blocked-state signals with evidence and remaining steps for the harness to interpret. + +### Changed + +- **Completion and retry guards are stricter after tool activity** — repeated malformed tool retries are suppressed semantically, not only by identical payload fingerprint. +- **Tool results carry richer machine-readable metadata** — file writes and patches expose line counts, terminal results expose truncation, and searches expose pagination headers so the agent can continue intelligently. +- **Several tool schemas now align more closely with runtime behavior** — clearer required arguments and fewer misleading schema-level requirements reduce avoidable invalid-argument loops. + +### Verification + +| Check | Result | +|--------|--------| +| `./scripts/release-version.sh check` | **passed locally before cut** | +| `cargo test -p edgecrab-core --lib` | **passed locally before cut** | +| `cargo test -p edgecrab-tools --lib` | **passed locally before cut** | +| `cargo test --workspace` | **passed locally before cut** | +| `fnm exec --using v22.12.0 pnpm build` in `site/` | **passed locally before cut** | + ## [0.8.0] — 2026-04-21 ### Added diff --git a/site/src/content/docs/features/overview.md b/site/src/content/docs/features/overview.md index 392671f..70db520 100644 --- a/site/src/content/docs/features/overview.md +++ b/site/src/content/docs/features/overview.md @@ -25,6 +25,11 @@ reasons about a task, calls a tool, observes the result, then repeats. The loop runs up to `model.max_iterations` tool calls (default: 90) before stopping. +For weaker models or long tool-heavy runs, EdgeCrab can also enable an +opt-in **Shadow Judge** completion oracle. 
It performs a cheap secondary +LLM verdict before the loop accepts a final answer, helping catch +premature "I'm done" stops that still have missing sub-steps. + ### Ratatui TUI A full-featured terminal UI with: @@ -34,6 +39,7 @@ A full-featured terminal UI with: - Keyboard-driven interface - Customizable skins, symbols, and personality presets - **Mission Steering** (`Ctrl+S`) — inject hints, redirects, or stop signals into a running agent loop mid-turn +- **Shadow Judge controls** (`/shadow-judge`) — toggle the completion oracle per session and see intervention badges when it keeps the run alive See [TUI Interface](/features/tui/) and [Mission Steering](/features/steering/) diff --git a/site/src/content/docs/reference/configuration.md b/site/src/content/docs/reference/configuration.md index 980f0c6..623c547 100644 --- a/site/src/content/docs/reference/configuration.md +++ b/site/src/content/docs/reference/configuration.md @@ -229,6 +229,15 @@ display: update_check_interval_hours: 24 skin: "default" +# ── Completion Oracle ───────────────────────────────────────────────── +shadow_judge: + enabled: false # opt-in secondary completion verdict + model: ~ # null = auxiliary.model → main model fallback + max_per_session: 5 # guardrail against correction loops + confidence_threshold: 0.70 # below this, let the main loop finish normally + context_messages: 20 # tail message window sent to the judge (0 = all) + min_messages_before_enable: 4 # skip trivial one-shot sessions + # ── Privacy ──────────────────────────────────────────────────────────── privacy: redact_pii: false diff --git a/site/src/content/docs/reference/slash-commands.md b/site/src/content/docs/reference/slash-commands.md index 544c946..7db8fec 100644 --- a/site/src/content/docs/reference/slash-commands.md +++ b/site/src/content/docs/reference/slash-commands.md @@ -30,7 +30,7 @@ edgecrab slash btw "sanity-check this migration plan" Navigation /help /quit /clear /new /status /version Model /model 
/cheap_model /vision_model /image_model /moa /provider /reasoning /stream Session /session /sessions /retry /undo /stop /btw /history /save /export /title /resume /branch -Config /config /prompt /verbose /personality /statusbar /worktree /yolo +Config /config /prompt /verbose /personality /statusbar /worktree /yolo /shadow-judge Tools /tools /toolsets /mcp /reload-mcp /plugins Memory /memory /skills /profile /profiles Analysis /cost /usage /compress /insights @@ -110,6 +110,7 @@ MCP /mcp /mcp-token | `/log [open\|level ]` | Open the log browser, live-follow local log tails, or persist the default log level (alias: `/logs`) | | `/worktree [status\|on\|off\|toggle]` | Open the git-worktree status overlay or change the saved default for future launches (alias: `/w`) | | `/yolo [on\|off\|toggle\|status]` | Toggle session-scoped dangerous-command approval bypass | +| `/shadow-judge [on\|off\|toggle\|status]` | Toggle the session-scoped completion oracle that vetoes likely premature run termination (aliases: `/sj`, `/shadow_judge`) | --- diff --git a/specs/nova_issue/00-index.md b/specs/nova_issue/00-index.md new file mode 100644 index 0000000..f326632 --- /dev/null +++ b/specs/nova_issue/00-index.md @@ -0,0 +1,59 @@ +# Bedrock Nova Premature Stop — Specification Index + +> **Status:** Analysis + Implementation + Architectural Evolution +> **Scope:** edgecrab-core, edgequake-llm + +## Documents in this Series + +| # | File | Purpose | Status | +|---|------|---------|--------| +| 01 | [01-problem-analysis.md](01-problem-analysis.md) | Evidence chain from user symptom, runtime loop, and AWS docs | Complete | +| 02 | [02-first-principles.md](02-first-principles.md) | First-principle decomposition of why Nova stops early | Complete | +| 03 | [03-adr-completion-gate.md](03-adr-completion-gate.md) | ADR: heuristic fix (deferred-work phrase detection) | Accepted + Implemented | +| 04 | [04-implementation-plan.md](04-implementation-plan.md) | Concrete heuristic implementation 
and regression coverage | Complete | +| 05 | [05-verification-plan.md](05-verification-plan.md) | Heuristic validation steps and pass criteria | Complete | +| 06 | [06-shadow-judge-critique-and-concept.md](06-shadow-judge-critique-and-concept.md) | Why the heuristic is structurally wrong + first-principles shadow judge design | Complete | +| 07 | [07-adr-shadow-judge.md](07-adr-shadow-judge.md) | ADR: shadow judge — semantic LLM completion oracle | Proposed | +| 08 | [08-shadow-judge-implementation.md](08-shadow-judge-implementation.md) | Shadow judge Rust implementation plan (all files, API, tests) | Ready to implement | + +## Evolution of This Spec Set + +### Phase 1 — Nova Heuristic (docs 01–05, implemented) + +Amazon Bedrock Nova is allowed to return `stopReason = end_turn` after a tool +result even when the higher-level user task is still unfinished. EdgeCrab must +therefore distinguish **"model stopped generating"** from **"task is actually +complete"**. + +The initial fix (ADR 003, doc 03) added a phrase-matching heuristic in +`completion_assessor.rs` that detects deferred-work language after tool activity. +This was implemented and all 12 existing tests pass. + +### Phase 2 — Shadow Judge Architecture (docs 06–08, proposed) + +The heuristic is a syntactic fix for a semantic problem. Doc 06 demonstrates that +phrase-matching over LLM output cannot reliably classify task completion because: + +- Natural language has infinite surface forms (the vocabulary is always incomplete). +- The 240-character window is arbitrary and model-update-fragile. +- The heuristic has no concept of the original user intent. +- False positives block completed sessions unnecessarily on strong models. + +The **shadow judge** (ADR 007, doc 07) is the principled successor: a single +lightweight LLM classification call that reads the full conversation history and +returns a structured JSON verdict (`complete` / `incomplete` + confidence + reason ++ steering hint). 
+ +Key properties: +- **Opt-in** (default: disabled). Zero impact on strong-model sessions. +- **Session-isolated**: never mutates `session.messages`; prompt cache preserved. +- **Near-zero cost**: ~$0.004/invocation due to Anthropic prompt cache reuse. +- **General**: applicable to all weak models, not just Nova. +- **Steering**: judge provides specific next-action hints, not generic "continue" nudges. + +## One-Line Summary + +Amazon Bedrock Nova uses `end_turn` as a turn signal, not a task signal; the +short-term fix is phrase-detection at the completion gate; the principled fix is +a lightweight LLM oracle that classifies task completion semantically over the +full conversation trajectory. \ No newline at end of file diff --git a/specs/nova_issue/01-problem-analysis.md b/specs/nova_issue/01-problem-analysis.md new file mode 100644 index 0000000..58d0bdb --- /dev/null +++ b/specs/nova_issue/01-problem-analysis.md @@ -0,0 +1,94 @@ +# Bedrock Nova Premature Stop — Problem Analysis + +Cross-ref: [00-index.md](00-index.md), [02-first-principles.md](02-first-principles.md), [03-adr-completion-gate.md](03-adr-completion-gate.md) + +## Symptom + +With `bedrock/amazon.nova-lite-v1:0`, EdgeCrab may: + +1. Execute one tool successfully. +2. Receive a non-empty assistant message such as "Now I'll create the full file...". +3. End the turn as if the request were complete. + +The user then has to type `Continue` manually to get the next tool call. + +## Runtime Evidence + +### AWS contract + +From the AWS Bedrock Converse docs: + +- `stopReason` explains **why the model stopped generating content**. +- Example final response uses `"stopReason": "end_turn"`. +- The docs do **not** say that `end_turn` means the overall user task is done. + +From the AWS Bedrock tool-use docs: + +- Client-side tool calling requires the application to continue the + conversation after a tool result. +- The official sequence is: + 1. Send user message plus tools. + 2. 
Receive assistant message with `toolUse` and `stopReason == tool_use`. + 3. Execute the tool. + 4. Append a user `toolResult` message. + 5. Call `converse` again. + +Conclusion: Bedrock gives a **turn-level** stop signal, not a **task-level** + completion signal. + +### Edgequake / Bedrock provider evidence + +`edgequake-llm/src/providers/bedrock.rs` already does the Bedrock-specific +message mapping correctly: + +- Assistant tool requests are preserved as `ContentBlock::ToolUse`. +- Tool results are replayed as a user-role `ContentBlock::ToolResult`. +- `StopReason::EndTurn` is mapped to `finish_reason = "stop"`. + +This means the provider is representing the Bedrock contract faithfully. + +### EdgeCrab loop evidence + +`edgecrab-core/src/conversation.rs` already preserves the required assistant +tool-call message before appending the tool result message. + +That rules out the most obvious Bedrock protocol bug. + +### Actual harness gap + +`edgecrab-core/src/completion_assessor.rs` currently marks a run `Completed` +when all of the following are true: + +- the run is not interrupted, +- no clarification or approval is pending, +- there are no active TODO markers, +- the final response is non-empty, +- verification debt is not detected. + +There is currently no heuristic for this case: + +> The assistant returned **future-tense / deferred-work text** immediately after +> tool activity, which means the model narrated its next action instead of +> actually taking it. + +That is exactly the failure mode visible with Nova-lite. + +## Root Cause + +The root cause is **not** Bedrock message replay. + +The root cause is that EdgeCrab currently conflates: + +- **non-empty assistant text**, and +- **task completion**. + +For stronger models this often works accidentally. For Nova-lite it is unsafe, +because the model can legally end a Bedrock turn with interim narration such as +"Now I'll do X". 
+ +## Controlling Hypothesis + +If EdgeCrab marks responses containing clear deferred-work language as +`Incomplete` when they appear after recent tool activity, then the existing +auto-continue path in `conversation.rs` will keep the loop running without any +manual `Continue` from the user. \ No newline at end of file diff --git a/specs/nova_issue/02-first-principles.md b/specs/nova_issue/02-first-principles.md new file mode 100644 index 0000000..ad47d4c --- /dev/null +++ b/specs/nova_issue/02-first-principles.md @@ -0,0 +1,66 @@ +# Bedrock Nova Premature Stop — First Principles + +Cross-ref: [01-problem-analysis.md](01-problem-analysis.md), [03-adr-completion-gate.md](03-adr-completion-gate.md) + +## First Principle 1: Provider stop != task complete + +`stopReason = end_turn` means only that the model stopped emitting tokens for +this API call. + +It does **not** prove that the user's requested outcome exists yet. + +## First Principle 2: Tool success != workflow success + +A successful `write`, `read_file`, or `terminal` call is only evidence that one +step ran. It is not evidence that the whole requested workflow is finished. + +Creating `./game2/` is not equivalent to creating the whole game. + +## First Principle 3: Narrated intent is a negative completion signal + +When the assistant says: + +- "Now I'll create..." +- "Let me write..." +- "Next I will run..." + +the assistant is explicitly stating that a required step still lies in the +future. + +That language is evidence of incompleteness, not completion. + +## First Principle 4: The safest place to fix this is the completion gate + +The Bedrock provider should continue to map AWS semantics faithfully. + +The conversation loop already has a generic recovery path for incomplete final +text: + +- `assess_completion(...)` returns `Incomplete`. +- `conversation.rs` injects a follow-up system nudge. +- the ReAct loop continues. 
+ +Therefore the least invasive fix is to improve completion assessment, not to +invent Bedrock-specific tool replay logic or special-case the provider. + +## First Principle 5: Precision requires structural context + +Future-tense phrases alone are too broad. + +To avoid false positives, the heuristic should require both: + +1. recent tool activity, and +2. explicit deferred-work phrasing in the final assistant text. + +That keeps normal explanatory answers safe while catching Nova's premature +handoff behavior. + +## Design Invariants + +| # | Invariant | +|---|-----------| +| I1 | Preserve Bedrock provider semantics exactly | +| I2 | Preserve existing assistant `toolUse` + user `toolResult` replay | +| I3 | Classify narrated-next-step responses after tool activity as `Incomplete` | +| I4 | Do not require model-specific branches when a generic completion heuristic works | +| I5 | Add regression tests for both positive and negative cases | \ No newline at end of file diff --git a/specs/nova_issue/03-adr-completion-gate.md b/specs/nova_issue/03-adr-completion-gate.md new file mode 100644 index 0000000..39ccdaf --- /dev/null +++ b/specs/nova_issue/03-adr-completion-gate.md @@ -0,0 +1,63 @@ +# ADR 003: Completion Gate For Deferred-Work Responses + +Cross-ref: [01-problem-analysis.md](01-problem-analysis.md), [02-first-principles.md](02-first-principles.md), [04-implementation-plan.md](04-implementation-plan.md) + +## Status + +Accepted + +## Context + +Bedrock Nova can return a syntactically valid assistant message and stop the +turn even when the requested workflow is not complete. + +EdgeCrab already has: + +- correct Bedrock tool-use history replay, +- a generic auto-continue loop for `Incomplete` runs, +- a completion assessor that is currently too optimistic. + +## Decision + +Add a completion-assessment heuristic that marks a run `Incomplete` when: + +1. the final assistant text contains explicit deferred-work language, and +2. 
the recent message history shows tool activity. + +Examples of deferred-work language: + +- "Now I'll ..." +- "I will ... next" +- "Let me ..." +- "I’m going to ..." + +## Why this decision + +- It fixes the root cause at the decision boundary where the premature stop is + misclassified. +- It reuses the existing continuation machinery instead of adding another loop + or another provider retry path. +- It remains stable even if Bedrock adds more Nova variants, because the issue + is not the model ID itself but the harness interpretation of the returned text. + +## Rejected alternatives + +### 1. Bedrock-specific provider retry logic + +Rejected because the provider is already faithfully representing AWS semantics. + +### 2. Force `tool_choice = required` after every tool result + +Rejected because it is too blunt and can force unnecessary tools when the next +turn should legitimately answer in text. + +### 3. Require `report_task_status` for all completions + +Rejected because it would create a larger behavioral change across all models +and all tasks. + +## Consequences + +- Nova-lite no longer needs a manual `Continue` for this class of premature stop. +- Other weaker tool-using models benefit from the same safeguard. +- The heuristic must stay narrow to avoid false positives on genuine final text. \ No newline at end of file diff --git a/specs/nova_issue/04-implementation-plan.md b/specs/nova_issue/04-implementation-plan.md new file mode 100644 index 0000000..0f9bceb --- /dev/null +++ b/specs/nova_issue/04-implementation-plan.md @@ -0,0 +1,24 @@ +# Bedrock Nova Premature Stop — Implementation Plan + +Cross-ref: [03-adr-completion-gate.md](03-adr-completion-gate.md), [05-verification-plan.md](05-verification-plan.md) + +## Files + +- `crates/edgecrab-core/src/completion_assessor.rs` +- optionally `crates/edgecrab-core/src/conversation.rs` if follow-up wording needs tightening + +## Planned changes + +1. 
Add a helper that detects recent tool activity in the message history. +2. Add a helper that detects deferred-work language in the final assistant text. +3. In `assess_completion`, classify that combination as `Incomplete`. +4. Add unit tests for: + - a Nova-style premature "Now I'll ..." response after a tool result, + - a valid final answer after a tool result, + - future-tense text without recent tool activity. + +## Non-goals + +- No Bedrock protocol rewrite. +- No provider-specific retry loop. +- No change to the AWS Bedrock `StopReason` mapping. \ No newline at end of file diff --git a/specs/nova_issue/05-verification-plan.md b/specs/nova_issue/05-verification-plan.md new file mode 100644 index 0000000..a530514 --- /dev/null +++ b/specs/nova_issue/05-verification-plan.md @@ -0,0 +1,17 @@ +# Bedrock Nova Premature Stop — Verification Plan + +Cross-ref: [04-implementation-plan.md](04-implementation-plan.md) + +## Focused checks + +1. Unit tests for `completion_assessor.rs` pass. +2. The premature-stop regression test proves the outcome is `Incomplete`. +3. A normal post-tool final answer is still classified as `Completed`. +4. `cargo test -p edgecrab-core completion_assessor` passes. +5. If the touched slice compiles cleanly, run a narrow crate check for + `edgecrab-core`. + +## Pass criteria + +The fix is accepted when EdgeCrab's completion gate no longer treats +"Now I'll ..." style post-tool narration as a completed run. 
\ No newline at end of file diff --git a/specs/nova_issue/06-shadow-judge-critique-and-concept.md b/specs/nova_issue/06-shadow-judge-critique-and-concept.md new file mode 100644 index 0000000..5254d48 --- /dev/null +++ b/specs/nova_issue/06-shadow-judge-critique-and-concept.md @@ -0,0 +1,516 @@ +# Shadow Judge — Critique of the Heuristic and First-Principles Design + +Cross-ref: [00-index.md](00-index.md), [02-first-principles.md](02-first-principles.md), +[03-adr-completion-gate.md](03-adr-completion-gate.md), [07-adr-shadow-judge.md](07-adr-shadow-judge.md), +[08-shadow-judge-implementation.md](08-shadow-judge-implementation.md) + +--- + +## 1. Why the Heuristic Is Wrong — A Brutal Analysis + +The heuristic fix in `completion_assessor.rs` (ADR 003) scans the first 240 characters of +the final assistant message for phrases such as `"Let me"`, `"I will"`, `"I'm going to"`, +gated by a check that a tool result appeared in the last six messages. + +This is **syntactic classification over semantic content**. It solves a surface symptom +while leaving the root structural gap open. + +### 1.1 The Open-World Language Problem + +Natural language has infinitely many surface forms for the same underlying meaning. +The action-verb vocabulary in the heuristic is a finite, hand-curated list. +Any list curated today is already stale tomorrow because: + +- Models change (fine-tuning, RLHF, instruction tuning alters output phrasing). +- The current list contains 13 phrases. The English language has hundreds of ways + to signal deferred intent: `"I plan to"`, `"I intend to"`, `"I'll go ahead and"`, + `"I should now"`, `"the next step is"`, `"subsequently I will"`, etc. +- The list will drift further as Nova and other weak models are updated by AWS/OpenAI/Google. + +**Consequence:** Every model update can silently break coverage. The heuristic requires +permanent maintenance with no principled stopping condition. 
+ +### 1.2 The 240-Character Window Fallacy + +The heuristic only looks at the first 240 characters of the final message. +This was chosen because Nova's deferred-work phrases typically appear early in the text. +But: + +- Nova's phrasing is not contractually stable. A different Nova instruction tune can + reorder the text. +- Other models (GPT-4o-mini, Mistral-small, Gemini-flash) can produce deferred-work + signals mid-response or late-response. +- 240 characters is not derived from any model contract or tokenizer analysis. + It is an empirical number from one observed Nova failure. + +**Consequence:** The heuristic misses any model that happens to produce deferred-work +language beyond the 240-character mark. + +### 1.3 The Context-Blindness Problem + +The heuristic has **no concept of what the original user task was**. + +Consider: the user asks `"Write me a poem about the ocean."` The model replies: +`"Let me write that poem for you:\n\nDeep calls to deep..."`. The heuristic fires +`has_deferred_work_signal` because "Let me" + "write" matches, and there was +a previous tool call (e.g., reading context from disk), so `has_recent_tool_activity` +is true. + +**Result: false positive.** The task is complete. The poem is in the response. The model +is correctly using "Let me write" as a rhetorical preamble to inline delivery, not as an +announcement of a future step. The heuristic incorrectly injects a continuation nudge. + +The heuristic cannot distinguish: +- "Let me write the file" → **deferred** (the file hasn't been written yet) +- "Let me write that poem for you: [poem]" → **not deferred** (the poem is the next token) + +Only a semantic reasoner that understands the context can make this distinction. + +### 1.4 The Precision-Recall Trade-off Has No Floor + +There is no principled way to tune the heuristic to be both precise and complete: + +- **If we narrow the vocabulary**: more false negatives (real deferral not caught). 
+- **If we widen the vocabulary**: more false positives (completion falsely blocked). + +The only way to achieve both is to understand the semantics of the conversation, which +the heuristic cannot do. + +### 1.5 The False-Positive Cost Is Non-Trivial + +A false positive causes the loop to continue when the task is done. This: + +- Wastes tokens on the follow-up nudge and the model's response to it. +- Can cause the model to re-do work it already completed. +- Creates a confusing user experience where the agent "keeps going" after finishing. +- Consumes iteration budget unnecessarily. + +For strong models (Claude Opus 4, GPT-4o), which are already self-consistent about +task completion, the heuristic is more likely to create false positives than to fix +anything real. + +### 1.6 The Adversarial Update Problem + +If a weak model is fine-tuned to work well with EdgeCrab, it will learn to avoid the +trigger phrases even when deferring work. The heuristic is not robust to model updates. +It is permanently one step behind model behavior with no principled termination condition. + +### 1.7 Summary: The Heuristic Violates First Principle 5 (Structural Context) + +From [02-first-principles.md](02-first-principles.md), First Principle 5: + +> Future-tense phrases alone are too broad. +> To avoid false positives, the heuristic should require both: +> 1. recent tool activity, and +> 2. explicit deferred-work phrasing in the final assistant text. + +The existing specs acknowledged this limitation. The heuristic is therefore the **minimum +viable patch**, not the permanent solution. This document establishes why, and what the +principled replacement is. + +--- + +## 2. First-Principles Derivation of Completion Detection + +### 2.1 The Fundamental Question + +A task is complete if and only if: + +> Every sub-goal of the original user request has been addressed, and the state of the +> world (files, processes, services, conversation) matches the intended final state.
+ +This definition has three key properties: + +1. It is **semantic** — it depends on understanding intent, not surface text. +2. It is **trajectory-global** — it requires reading the full conversation history, + not just the last message. +3. It is **evidence-requiring** — it requires that the state change is real, not just + announced. + +### 2.2 What Information Is Required to Detect Completion + +To evaluate whether a task is complete, a decision-maker needs: + +| Required Information | Available to Heuristic | Available to LLM Judge | +|----------------------|------------------------|------------------------| +| Original user intent | ❌ (no memory) | ✅ (read first message) | +| Sub-goals enumerated | ❌ | ✅ (read the whole trajectory) | +| Tool execution evidence | ✅ (checks last 6 msgs) | ✅ | +| Semantic meaning of final text | ❌ | ✅ | +| Relationship between sub-goals | ❌ | ✅ | +| Model-specific phrasing patterns | ❌ (requires tuning) | ✅ (naturally handles) | + +The heuristic satisfies 1/6. An LLM judge satisfies 6/6. + +### 2.3 The Self-Referential Justification + +An LLM is the right entity to judge LLM task completion because: + +1. **Same capability**: The judge can understand sub-goals that require LLM reasoning. +2. **Same language**: The judge reads the same natural language output that humans read. +3. **Different task**: The judge performs a narrow classification task (complete / not complete) + which is reliably elicitable via a constrained, targeted prompt, unlike the open-ended + generative task of the main agent. +4. **Verifiable**: The judge's output is structured JSON, not prose — machine-readable and + auditable. + +This is not circular: a "is this done?" query is provably narrower in scope than an +"execute this task" query. The classification error rate for well-designed judges over +structured trajectories is much lower than the false-positive/false-negative rate of +any static heuristic. 
+ +### 2.4 First Principle: The Shadow Judge Invariants + +| # | Invariant | Rationale | +|---|-----------|-----------| +| SJ-1 | Shadow judge **never writes** to `session.messages` | Preserves session purity and Anthropic prompt cache | +| SJ-2 | Shadow judge is **read-only** on session history | Prevents state pollution of the main conversation | +| SJ-3 | Shadow judge can only **downgrade** (Completed → Incomplete), never upgrade | Veto-only semantics; avoids falsely ending blocked runs | +| SJ-4 | Shadow judge uses an **isolated provider call** with its own message list | Cache on the main session is untouched | +| SJ-5 | Shadow judge **reuses the cached stable system block** | Makes the shadow call nearly free on Anthropic | +| SJ-6 | Shadow judge is **opt-in** and configurable per model family | Avoids overhead on strong models that don't need it | +| SJ-7 | Shadow judge has a **per-session invocation limit** | Prevents infinite correction loops | +| SJ-8 | Shadow judge fires only when the synchronous assessor returns **Completed** | Short-circuits cleanly; avoids redundant calls | +| SJ-9 | Shadow judge output is **structured JSON** (`verdict`, `confidence`, `reason`, `steering_hint`) | Machine-readable, testable, auditable | +| SJ-10 | Shadow judge tokens are accounted in the session's **total token usage** | Cost transparency | + +--- + +## 3. The Shadow Judge Concept + +### 3.1 What It Is + +A *shadow judge* is a single, stateless LLM classification call that: + +1. Takes a snapshot of the current conversation history (read-only clone of + `session.messages`). +2. Prepends a compact system prompt: "You are a task-completion oracle. Output JSON." +3. Appends the original user goal and a structured classification question. +4. Makes one chat API call — **no tools, no streaming, no session persistence**. +5. Parses the JSON verdict. +6. 
If incomplete: constructs a targeted continuation message (steering hint) and injects + it into `session.messages` as a `Message::user(...)`. +7. If complete: does nothing. The main loop breaks normally. + +The judge's own API call and response are never added to `session.messages`. The session +is unmodified except for the optional steering hint if the task is incomplete. + +### 3.2 What It Is Not + +- **Not a child Agent**: `CoreSubAgentRunner` spawns a full `Agent` instance with its own + `execute_loop`, tool dispatch, session DB, todo store, etc. That is 10× more overhead. + The shadow judge is a single `provider.chat_with_tools()` call with no tool dispatch. +- **Not compression**: `compress_with_llm` reshapes the session history. The shadow judge + reads it but does not reshape it. +- **Not learning reflection**: `run_learning_reflection_bg` fires after session completion + to persist skills/memories. The shadow judge fires *before* completion to veto premature + exits. + +### 3.3 Judge Prompt Design + +The judge prompt is designed to: + +1. Be minimal (~120 tokens) so it doesn't pollute the context or cost much. +2. Be precise about the output format. +3. Err toward "incomplete" (strict judge) to avoid missing real gaps. + +``` +System (shadow judge only): + You are a task-completion oracle. Your only output MUST be a JSON object. + No prose. No explanation outside the JSON. + Schema: {"verdict":"complete"|"incomplete","confidence":0.0-1.0,"reason":"<1 sentence>","steering_hint":"<next concrete action, or empty if complete>"} + Rules: + - "complete" means ALL parts of the user's original request are DONE with evidence. + - If the agent described a future action but has not taken it yet, output "incomplete". + - If any sub-task is missing evidence, output "incomplete". + - Be strict. When uncertain, prefer "incomplete". +``` + +Final message appended to the judge's message list (not the main session): + +``` +[shadow-judge query] +Original user request: <original user request text> +Has this request been fully completed? 
Check all sub-goals against the conversation history. +Output JSON verdict. +``` + +### 3.4 Token Economics (Anthropic) + +Anthropic pricing (claude-haiku-4 family as reference judge model): +- Cache read: $0.30 / MTok +- Cache write: $3.75 / MTok +- Output: $1.25 / MTok (haiku) vs $15 / MTok (Opus) + +**For a 10k-token conversation (typical), with prompt caching active:** + +| Component | Tokens | Rate | Cost | +|-----------|--------|------|------| +| Stable system block (cached) | 2,000 | $0.30/MTok cache read | $0.0006 | +| Conversation history (cached) | 10,000 | $0.30/MTok cache read | $0.003 | +| Judge prompt (new) | 120 | $3.75/MTok cache write | $0.00045 | +| Verdict output | 60 | $1.25/MTok output | $0.000075 | +| **Total shadow call** | **12,180** | — | **~$0.004** | + +A typical Opus-4 session runs $0.50–$2.00. The shadow call adds **<0.8%** to session cost. +With a cheap judge model (haiku-4, flash-3, mini), this is negligible. + +**If the judge is set to the same model as the main agent (e.g., claude-opus-4):** + +| Component | Tokens | Rate | Cost | +|-----------|--------|------|------| +| Conversation history (cached) | 10,000 | $0.30/MTok cache read | $0.003 | +| Verdict output | 60 | $15/MTok output | $0.0009 | +| **Total shadow call** | — | — | **~$0.004** | + +Even with Opus as judge, the cost is trivially small because output is tiny and input is cached. + +**Latency:** One extra API round-trip to a fast model (haiku, flash) adds ~300–800ms. +With Opus, ~1–3s. Acceptable given the Nova failure mode is 30+ seconds of manual retries. + +### 3.5 Cache Isolation Guarantee + +The Anthropic prompt cache is keyed on the prefix of the message list, including +`cache_control: ephemeral` markers. + +**Main session's cache is safe because:** + +1. The shadow judge calls `provider.chat_with_tools()` on a **cloned** message list, built + independently, never mutating `session.messages`. +2. 
The judge's call is a separate HTTP request with its own message array. +3. Anthropic's cache is server-side: the cache key depends on the content of the messages + array you send. The judge's call is a distinct request and creates or hits its own + cache entries. +4. The judge's call does NOT write `cache_control` markers into `session.messages`. + +**There is no shared mutable state between the main session's prompt cache and the +shadow call. They are two separate HTTP requests.** + +The one nuance: if the main session's conversation history is cached (as it will be for +messages up to the last `cache_control` breakpoint), the shadow call that reads the same +history will HIT that cache. This is the desired behavior — cheap cache reads, not +expensive re-processing. + +--- + +## 4. Edge Cases + +### 4.1 Shadow Judge Returns "Incomplete" Spuriously + +**Scenario:** Judge confidently says "incomplete" but the task was actually done. + +**Mitigation:** +- `max_per_session` limit (default: 5) caps correction loops. +- `confidence_threshold` (default: 0.70) — if confidence < threshold, treat as "complete" + to avoid spurious loops. +- The synchronous assessor already handles structural signals (todo list, clarify markers). + The judge is only called when those pass. If the judge fires 5 times and the task isn't + progressing, something else is wrong. + +### 4.2 Weak Model Used as Judge + +**Scenario:** The shadow judge is routed to the same weak model that's causing the +problem (e.g., Nova-lite as both main agent and judge). + +**Mitigation:** +- `shadow_judge.model` should default to `auxiliary.model`, which should be a stronger/ + different model. If the user is running Nova-lite + Nova-lite (judge = main), the judge + may also be unreliable. +- Document this: shadow judge works best when `model` ≠ main agent model. +- Future work: model-family-aware default fallback (e.g., auto-select claude-haiku-4 when + main model is `amazon.nova-*`). 
+ +### 4.3 Shadow Judge Itself Fails (Network, Auth, Timeout) + +**Scenario:** The shadow judge API call throws an error. + +**Mitigation:** +- Judge failure is non-fatal: `run_shadow_judge()` returns `Option<ShadowVerdict>`. +- `None` is treated the same as a `complete` verdict (conservatively fall back to + synchronous assessor's judgment). +- Error is logged at `tracing::warn!` level for observability. + +### 4.4 Infinite Correction Loop + +**Scenario:** Judge returns "incomplete" → agent responds → judge again returns "incomplete" +→ repeat forever. + +**Mitigation:** +- `max_per_session: 5` (hard cap). After 5 invocations, the shadow judge is skipped for + the rest of the session. +- The main loop already has `max_iterations` as a hard cap (default: 90). +- If after 5 judge invocations the task still isn't completing, the agent has a deeper + problem that the shadow judge cannot fix. Log and let the session complete normally. + +### 4.5 Prompt Cache Miss on Judge Call (First Turn) + +**Scenario:** First invocation of the session — nothing is cached yet. + +**Behavior:** Normal cache write cost (same as the main conversation turn). The cache +investment pays for itself on future turns. This is identical to any other first-turn +Anthropic call. + +### 4.6 Very Short Sessions (< 3 Messages) + +**Scenario:** User asks a one-shot question, agent responds with no tools, shadow judge +fires. + +**Mitigation:** +- SJ-8: Shadow judge only fires when the synchronous assessor returns `Completed`. +- For tool-free sessions, the synchronous assessor correctly returns `Completed`. +- The shadow judge should additionally skip when `session.messages.len() < MINIMUM_MESSAGES` + (suggested: 4). A one-shot Q&A doesn't need a judge. + +### 4.7 Token Budget and `max_per_session` Interaction + +**Scenario:** Session has used 88 out of 90 max iterations. Shadow judge fires and returns +"incomplete", injecting a continuation nudge. The loop continues but hits the hard cap at +iteration 90. 
+ +**Behavior:** Correct. The loop exits due to `budget_exhausted = true`, which the +synchronous assessor already marks as `Incomplete` / `Failed`. The shadow judge doesn't +change this — it only fires when the synchronous assessor has already said "Completed." +If the session is nearly out of budget, the synchronous assessor returns `Failed` due +to budget exhaustion, and the shadow judge never fires (SJ-8). + +### 4.8 Streaming vs Non-Streaming + +**Scenario:** Shadow judge is called from a streaming main session. + +**Behavior:** The shadow judge call is always non-streaming (a single `chat_with_tools` +call awaited for its structured JSON response). The streaming session of the main loop +is unaffected because the shadow judge runs in the `LoopAction::Done` branch, which is +reached only after streaming has completed for that turn. + +### 4.9 Session `skip_memory` and `skip_context_files` Flags + +**Scenario:** Session has `skip_memory = true`. + +**Behavior:** The shadow judge is passed only the conversation history snapshot (no memory +or context files). The judge's system prompt is self-contained. The skip flags on the main +agent config do NOT apply to the shadow judge's internal prompt because the judge has its +own dedicated system prompt. + +### 4.10 Anthropic Prompt Caching and the Dual System Block + +The main session uses a stable/dynamic system block split (see `build_chat_messages_blocks` +in `conversation.rs`). The stable block has `cache_control: ephemeral`. + +The shadow judge builds its own message list: +1. A single judge system prompt (no stable/dynamic split needed). +2. The conversation history clone from `session.messages`. +3. The judge query message. + +The shadow judge does NOT set `cache_control` on any messages. This is intentional: +- The conversation history is already cached by the main session's breakpoints. +- A shadow call that reads the same cached content will hit those existing cache entries. 
+- Setting cache_control on the judge's messages would create new cache entries that + overlap with the main session's entries, wasting cache write budget. + +**Correct behavior:** Shadow call's message list hits the existing cache entries created +by the main session's `apply_cache_control` calls. No `cache_control` needed on the judge +call itself. + +--- + +## 5. Scope Beyond Nova + +The shadow judge is not a Nova-specific patch. It is a general solution to the structural +problem described in [02-first-principles.md](02-first-principles.md): + +> The safest place to fix this is the completion gate. + +The shadow judge upgrades the completion gate from a syntactic filter to a semantic +reasoner. Beneficiaries beyond Nova-lite: + +| Scenario | How Shadow Judge Helps | +|----------|------------------------| +| GPT-4o-mini declares completion after first step | Judge identifies missing sub-goals | +| Claude Haiku halts after scaffold with "the rest is left as an exercise" | Judge detects incomplete delivery | +| Gemini Flash-lite narrates future steps in Markdown format | Judge parses intent vs. evidence | +| Any weak model that fails `report_task_status` | Judge provides semantic coverage | +| Complex multi-file tasks where agent misses one file | Judge notices sub-goal gap | +| Long sessions where agent loses track of original goal | Judge re-anchors to first user message | + +The shadow judge also enables **proactive steering**: the `steering_hint` field in the +verdict allows the judge to specify exactly what action the agent should take next, +rather than the generic "do not stop yet" message from `build_completion_follow_up_message`. + +--- + +## 6. Relationship to Existing Heuristic (ADR 003) + +The shadow judge is a **successor** to ADR 003's heuristic, not a replacement at the same +level of implementation complexity. 
The layered architecture is: + +``` +Layer 1 (synchronous, free): + DefaultCompletionPolicy::assess() + ├── Active todos → Incomplete + ├── Clarify/approval pending → Incomplete + ├── has_remaining_steps → Incomplete + ├── [ADR 003] deferred_work heuristic → Incomplete ← current state + └── else → Completed + +Layer 2 (async, cheap LLM call): + run_shadow_judge() ← proposed + ├── verdict="incomplete", confidence ≥ threshold → Incomplete + steering_hint + └── verdict="complete" or error → Completed (pass through) +``` + +Layer 1 remains in place. It handles structural signals cheaply and is correct for the +easy cases. Layer 2 only fires when Layer 1 passes, providing semantic coverage for the +cases Layer 1 cannot handle. + +**Should ADR 003's heuristic be removed once the shadow judge is implemented?** + +Not immediately. The heuristic provides zero-cost coverage for the specific Nova pattern. +It should be retained as a free fast-path guard. Long term, once the shadow judge is +battle-tested, the deferred-work heuristic can be narrowed or removed. This is tracked +in the implementation plan. + +--- + +## 7. Activation and Configuration Philosophy + +### 7.1 Default Off + +Shadow judge is off by default. Most sessions on strong models do not need it. Adding +latency and cost without user consent is poor defaults discipline. + +### 7.2 Auto-Suggest for Known Weak Models + +The `model_catalog.yaml` or `AgentBuilder` can carry a `suggest_shadow_judge: true` flag +for known weak models. The CLI / setup wizard surfaces a suggestion: "For best results +with amazon.nova-lite-v1:0, consider enabling shadow_judge in your config." + +This is a suggestion, not auto-enable. The user controls costs. + +### 7.3 Per-Session Override + +Config key `shadow_judge.enabled` is read from the layered config (default → disk → +env → CLI). A per-session override is supported via a flag or CLI argument, consistent +with the existing config override pattern. 
+ +### 7.4 Model Routing + +`shadow_judge.model` defaults to `auxiliary.model`. If `auxiliary.model` is not set, +falls back to the main agent model. The recommended configuration is: + +```yaml +auxiliary: + model: "anthropic/claude-haiku-4-20250514" + provider: "anthropic" + +shadow_judge: + enabled: true + # model: null → inherits auxiliary.model = claude-haiku-4 +``` + +This routes all side-task LLM calls (compression, shadow judge) to a cheap fast model. + +--- + +_See [07-adr-shadow-judge.md](07-adr-shadow-judge.md) for the decision record and +[08-shadow-judge-implementation.md](08-shadow-judge-implementation.md) for the +concrete implementation plan._ diff --git a/specs/nova_issue/07-adr-shadow-judge.md b/specs/nova_issue/07-adr-shadow-judge.md new file mode 100644 index 0000000..72eafde --- /dev/null +++ b/specs/nova_issue/07-adr-shadow-judge.md @@ -0,0 +1,219 @@ +# ADR 007: Shadow Judge — Semantic Completion Oracle + +Cross-ref: [06-shadow-judge-critique-and-concept.md](06-shadow-judge-critique-and-concept.md), +[08-shadow-judge-implementation.md](08-shadow-judge-implementation.md), +[02-first-principles.md](02-first-principles.md), [03-adr-completion-gate.md](03-adr-completion-gate.md) + +--- + +## Status + +**Proposed** + +--- + +## Context + +[ADR 003](03-adr-completion-gate.md) introduced a phrase-matching heuristic in +`completion_assessor.rs` to prevent premature session exits on Bedrock Nova-lite. + +That fix is correct for the narrow Nova pattern but violates the first principles +established in [02-first-principles.md](02-first-principles.md): + +> First Principle 3: Narrated intent is a negative completion signal. + +The heuristic detects *surface syntax* (future-tense phrases) as a proxy for narrated +intent. As documented in [06-shadow-judge-critique-and-concept.md](06-shadow-judge-critique-and-concept.md), +this proxy is: + +1. **Not complete** — finite vocabulary cannot cover all surface forms. +2. 
**Not context-aware** — cannot distinguish delivery from announcement. +3. **Not stable** — breaks silently with model updates. +4. **Not generalizable** — narrow enough to avoid false positives on Nova, but unable to + detect task incompleteness in any model that phrases deferral differently. + +A principled completion gate requires **semantic understanding of the full conversation +trajectory**, not phrase matching over the final message. + +The existing architecture already provides the necessary primitives: + +| Primitive | Location | Shadow Judge Use | +|-----------|----------|-----------------| +| `AuxiliaryConfig` | `config.rs` | Route shadow calls to a cheap model | +| `Arc<dyn LLMProvider>` | `agent.rs` | Make the shadow API call | +| `SteeringReceiver` / `steer_tx` | `steering.rs` | Deliver the steering hint to the loop | +| `session.messages` snapshot (clone) | `conversation.rs` | Provide read-only history to judge | +| `LoopAction::Done(text)` branch | `conversation.rs` | Integration point: veto before break | + +--- + +## Decision + +Add a **Shadow Judge** — a single-call LLM completion oracle that: + +1. Fires only after `DefaultCompletionPolicy::assess()` returns `Completed`. +2. Sends a read-only snapshot of the conversation history to a lightweight LLM. +3. Returns a structured JSON verdict (`complete` / `incomplete` + confidence + reason + + optional steering hint). +4. On `incomplete`: injects the steering hint as a `Message::user(...)` into + `session.messages` and continues the loop. +5. On `complete` or error: does nothing. The loop breaks normally. + +The shadow judge is: +- **Opt-in** (default: disabled). +- **Configurable** via a new `shadow_judge` section in `config.yaml`. +- **Auto-suggested** for known weak models in the model catalog. +- **Cost-safe** due to Anthropic prompt cache reuse (~$0.004 per invocation). + +--- + +## Why This Decision + +### Why an LLM oracle and not a better heuristic + +The completion question is semantically complex. 
No finite static rule set can cover all +surface forms of "deferred intent" across all current and future models. An LLM oracle +operates at the same semantic level as the content it is evaluating — it reads intent, not +just syntax. The error rate of a well-prompted LLM judge for this narrow classification +task is substantially lower than the false-positive/false-negative rate of any heuristic +that is also required to avoid breaking strong models. + +See §1 and §2 of [06-shadow-judge-critique-and-concept.md](06-shadow-judge-critique-and-concept.md) +for the full technical argument. + +### Why not a child Agent (`CoreSubAgentRunner`) + +`CoreSubAgentRunner` creates a full `Agent` instance with its own `execute_loop`, tool +dispatch, session DB, todo store, and iteration budget. This is appropriate for delegated +sub-tasks. For a single classification query it is 100× more overhead than necessary. The +shadow judge requires only one `provider.chat_with_tools()` call with no tool dispatch. + +### Why not integrate into `CompletionPolicy` trait + +`CompletionPolicy::assess()` is synchronous and takes no `Arc<dyn LLMProvider>`. Making +it async would require changing every call site in `conversation.rs` and would couple the +cheap synchronous assessment path to a potentially slow network call. The shadow judge +belongs at the call site in `conversation.rs` — after the synchronous assessor passes, as +an optional async veto. + +### Why not require `report_task_status` for all completions + +`report_task_status` requires the main agent to explicitly signal completion. This was +rejected in ADR 003 as a behavioral change across all models. The shadow judge is additive +and external — it does not change the protocol the main agent follows. + +### Why veto-only (downgrade, not upgrade) + +The shadow judge can observe that the task is *not* done by checking evidence gaps. 
It +cannot reliably certify that a task *is* done — completion is a stronger claim than +incompletion, and false positives (judge says "done" when not) would end the session +early. The conservative design is: the judge vetoes "complete" verdicts but never promotes +"incomplete" verdicts to "complete". + +### Why keep ADR 003's heuristic + +ADR 003's heuristic is a zero-cost first pass. It correctly handles the Nova-specific +pattern with no latency and no API call. The shadow judge fires only when the heuristic +passes. Removing the heuristic would force a shadow API call on every completion decision, +including trivial cases that the heuristic handles cheaply. The layered architecture +(heuristic fast-path → LLM slow-path) minimizes cost and latency. + +--- + +## Consequences + +### Positive + +- Semantic completion gate applicable to all LLM models without per-model tuning. +- Steering hints are specific and actionable (judge knows what sub-goal is missing), + replacing the generic "do not stop yet" message. +- Near-zero token cost due to prompt cache reuse. +- Opt-in; zero impact on strong-model sessions that don't need it. +- Generalizable beyond Nova: benefits all weak models and complex multi-step tasks. +- `max_per_session` guard prevents infinite correction loops. + +### Negative + +- Adds ~300–1500ms latency per invocation on the completion branch. +- Requires a second LLM API credential / routing config if the judge model is different + from the main model (mitigated by sharing `AuxiliaryConfig`). +- Judge model quality affects verdict quality. An unreliable judge model (e.g., using + Nova-lite itself as judge) may be no better than the heuristic. +- Not a silver bullet for all forms of model misbehavior. The judge catches task + incompleteness; it cannot fix context exhaustion, tool failures, or model hallucination. + +--- + +## Rejected Alternatives + +### A. Improved heuristic with wider vocabulary + +Rejected. 
Adding more phrases to the heuristic is a patch, not a fix. The open-world +language problem ensures any finite vocabulary is always incomplete. See §1.1 of doc 06. + +### B. Model-specific `tool_choice = required` forcing + +Rejected. Too blunt. Forces tool calls even when a final text answer is the correct +response to the user. + +### C. Full AsyncCompletionPolicy trait + +Rejected. Would require all synchronous `CompletionPolicy::assess()` callers to become +async. Too much churn for a selectively-needed feature. + +### D. Background shadow call (fire-and-forget like learning reflection) + +Rejected. The shadow judge needs to veto the loop *before* `final_response = text; break` +is reached. A fire-and-forget call cannot block the loop. Background mode is for +post-session side effects (skills, memory). The judge is a pre-break gate. + +### E. Prompt-based instruction to the main model to call `report_task_status` + +Rejected. Strong models already self-report reliably. Weak models ignore or misuse the +instruction — this is the fundamental problem the shadow judge is designed to solve. 
+ +--- + +## Design Constraints (Non-Negotiable) + +| # | Constraint | Origin | +|---|-----------|--------| +| C1 | Shadow call MUST NOT mutate `session.messages` | Prompt cache preservation | +| C2 | Shadow call MUST NOT rebuild the main session's system prompt | Cache invalidation risk | +| C3 | Shadow result MUST be structured JSON parseable without LLM retry | Reliability | +| C4 | Shadow judge MUST be skippable without code changes (config `enabled: false`) | User control | +| C5 | Shadow judge invocations per session MUST be bounded (`max_per_session`) | Loop safety | +| C6 | Shadow judge failure MUST be non-fatal (fall back to synchronous assessor) | Reliability | +| C7 | Shadow judge tokens MUST be accounted in session usage totals | Cost transparency | + +--- + +## Configuration Schema + +New top-level `config.yaml` section: + +```yaml +shadow_judge: + enabled: false # opt-in + model: null # null → use auxiliary.model → use main model + provider: null # null → use auxiliary.provider → use main provider + max_per_session: 5 # hard cap on invocations per session + confidence_threshold: 0.70 # below this confidence → treat as "complete" + context_messages: 20 # last N messages to send (0 = all) + min_messages_before_enable: 4 # skip judge for very short sessions +``` + +--- + +## Migration Path + +1. **Phase 1 (this ADR):** Implement `ShadowJudgeConfig`, `run_shadow_judge()`, and + integration point in `conversation.rs`. Deploy with `enabled: false` default. +2. **Phase 2:** Auto-suggest `enabled: true` in setup wizard for known weak models. + Update model catalog YAML with `suggest_shadow_judge: true` annotations. +3. **Phase 3 (long term):** Once shadow judge is battle-tested, narrow or remove the + ADR 003 deferred-work heuristic. The shadow judge subsumes its coverage. 
+ +--- + +_Implementation details: [08-shadow-judge-implementation.md](08-shadow-judge-implementation.md)_ diff --git a/specs/nova_issue/08-shadow-judge-implementation.md b/specs/nova_issue/08-shadow-judge-implementation.md new file mode 100644 index 0000000..ec41e0f --- /dev/null +++ b/specs/nova_issue/08-shadow-judge-implementation.md @@ -0,0 +1,751 @@ +# Shadow Judge — Concrete Implementation Plan + +Cross-ref: [07-adr-shadow-judge.md](07-adr-shadow-judge.md), +[06-shadow-judge-critique-and-concept.md](06-shadow-judge-critique-and-concept.md) + +--- + +## Overview + +This document specifies every code change required to implement the shadow judge. +All design constraints from ADR 007 are enforced by the implementation. + +| Change | File | Status | +|--------|------|--------| +| New config struct `ShadowJudgeConfig` | `config.rs` | Not started | +| `shadow_judge` field in `AppConfig` | `config.rs` | Not started | +| `shadow_judge` field in `AgentConfig` | `agent.rs` | Not started | +| `to_agent_config` projection | `agent.rs` | Not started | +| New module `shadow_judge.rs` | `shadow_judge.rs` | Not started | +| `mod shadow_judge` declaration | `lib.rs` | Not started | +| Integration in `execute_loop` | `conversation.rs` | Not started | +| Unit tests | `shadow_judge.rs` | Not started | + +--- + +## 1. Config Changes (`crates/edgecrab-core/src/config.rs`) + +### 1.1 Add `ShadowJudgeConfig` struct + +Insert after the existing `AuxiliaryConfig` struct (≈ line 2001): + +```rust +/// Shadow judge configuration — lightweight LLM completion oracle. +/// +/// When enabled, the shadow judge fires after the synchronous +/// `DefaultCompletionPolicy` returns `Completed`. It makes a single +/// LLM classification call to verify that the original user request is +/// actually satisfied before allowing the loop to break. +/// +/// Default: all fields produce a safe disabled state. 
+#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(default)] +pub struct ShadowJudgeConfig { + /// Enable shadow judge. Default: false (opt-in). + pub enabled: bool, + /// Judge model (e.g. "anthropic/claude-haiku-4-20250514"). + /// null → use auxiliary.model → use main model. + pub model: Option<String>, + /// Judge provider name override. + /// null → inferred from model prefix. + pub provider: Option<String>, + /// Hard cap on shadow judge invocations per session. + /// Prevents infinite correction loops. Default: 5. + pub max_per_session: u32, + /// If judge confidence < this threshold, treat verdict as "complete". + /// Range [0.0, 1.0]. Default: 0.70. + pub confidence_threshold: f32, + /// Number of most-recent messages to pass to the judge. + /// 0 = send all messages (caution: more tokens). Default: 20. + pub context_messages: usize, + /// Minimum conversation length before judge is eligible to fire. + /// Prevents single-turn Q&A sessions from being judged. Default: 4. + pub min_messages_before_enable: usize, +} + +impl Default for ShadowJudgeConfig { + fn default() -> Self { + Self { + enabled: false, + model: None, + provider: None, + max_per_session: 5, + confidence_threshold: 0.70, + context_messages: 20, + min_messages_before_enable: 4, + } + } +} +``` + +### 1.2 Add `shadow_judge` field to `AppConfig` + +In the `AppConfig` struct (≈ line 63), add after `auxiliary`: + +```rust + pub shadow_judge: ShadowJudgeConfig, +``` + +The `#[serde(default)]` on `AppConfig` ensures backward compatibility — existing +`config.yaml` files without a `shadow_judge` section load the `Default` (disabled). + +--- + +## 2. Agent Config Changes (`crates/edgecrab-core/src/agent.rs`) + +### 2.1 Add `shadow_judge` field to `AgentConfig` + +In `AgentConfig` struct (≈ line 132), add after the `auxiliary` field: + +```rust + /// Shadow judge configuration projected from AppConfig. 
+ pub shadow_judge: crate::config::ShadowJudgeConfig, +``` + +### 2.2 Add default for `shadow_judge` in `AgentConfig::default()` + +In the `Default` impl for `AgentConfig`: + +```rust + shadow_judge: crate::config::ShadowJudgeConfig::default(), +``` + +### 2.3 Project `shadow_judge` in `AppConfig::to_agent_config()` + +In the `to_agent_config` method (the large projection block, ≈ line 370), add: + +```rust + shadow_judge: self.shadow_judge.clone(), +``` + +--- + +## 3. New Module: `shadow_judge.rs` + +Create `crates/edgecrab-core/src/shadow_judge.rs`: + +```rust +//! # Shadow Judge — Lightweight LLM Completion Oracle +//! +//! Fires after the synchronous `DefaultCompletionPolicy` returns `Completed`. +//! Makes a single isolated LLM classification call to verify that the original +//! user request is fully satisfied before the main loop breaks. +//! +//! ## Session isolation guarantee +//! +//! The shadow judge: +//! 1. CLONES `session.messages` into a read-only snapshot. +//! 2. Builds an independent `Vec<ChatMessage>` for the judge call. +//! 3. Appends the judge query to that independent list only. +//! 4. Makes one `provider.chat_with_tools()` call with an EMPTY tool list. +//! 5. Parses the structured JSON verdict. +//! +//! The main `session.messages` is NEVER mutated by `run_shadow_judge`. +//! Only the CALLER in `conversation.rs` mutates `session.messages` when it +//! injects the steering hint message — a deliberate, controlled write. +//! +//! ## Prompt cache behaviour +//! +//! No `cache_control` markers are written on the judge's message list. +//! Because the conversation history in `session.messages` was already marked +//! with `cache_control: ephemeral` breakpoints by `apply_cache_control` during +//! the main loop, those same breakpoints exist inside the judge's cloned slice. +//! The Anthropic server-side cache will hit those entries without any extra +//! `cache_control` from the judge call. Writing new breakpoints would be both +//! 
redundant and wasteful (cache-write tokens are 12× more expensive than +//! cache-read tokens on Anthropic). + +use std::sync::Arc; + +use edgecrab_types::Message; +use edgequake_llm::LLMProvider; + +use crate::config::ShadowJudgeConfig; +use crate::conversation::build_chat_messages; + +// ─── System prompt ──────────────────────────────────────────────────────────── + +const SHADOW_JUDGE_SYSTEM_PROMPT: &str = "\ +You are a task-completion oracle. Your ONLY output is a JSON object. No prose outside the JSON. + +Output schema: +{\"verdict\":\"complete\"|\"incomplete\",\"confidence\":0.0-1.0,\"reason\":\"\",\"steering_hint\":\"\"} + +Strict rules: +- \"complete\" means EVERY part of the user's original request is DONE with concrete evidence in the conversation. +- If the agent announced a future action but has not yet executed it, output \"incomplete\". +- If any explicitly requested sub-task is missing evidence of completion, output \"incomplete\". +- When uncertain, prefer \"incomplete\". +- Output ONLY the JSON object. No markdown fences. No commentary."; + +/// Final user message appended to the judge's isolated message list. +/// Never added to session.messages — used only inside `run_shadow_judge`. +const SHADOW_JUDGE_QUERY: &str = "\ +[shadow-judge query] +Review the entire conversation above. Has the agent's most recent response fully \ +completed the original user request? Check every sub-goal explicitly. If any sub-goal \ +was promised or implied but not yet evidenced with tool output or concrete file content, \ +output \"incomplete\". Output JSON verdict now."; + +// ─── Public types ───────────────────────────────────────────────────────────── + +/// Structured verdict returned by the shadow judge. +#[derive(Debug, Clone)] +pub struct ShadowVerdict { + /// True if the judge considers the task fully complete. + pub is_complete: bool, + /// Judge's confidence in its verdict. Range [0.0, 1.0]. + pub confidence: f32, + /// One-sentence reason for the verdict. 
+ pub reason: String, + /// Specific next action the agent should take, if incomplete. + pub steering_hint: Option<String>, + /// Input tokens consumed by the judge call (for session cost tracking). + pub input_tokens: u32, + /// Output tokens consumed by the judge call (for session cost tracking). + pub output_tokens: u32, +} + +// ─── Public API ─────────────────────────────────────────────────────────────── + +/// Run the shadow judge classification call. +/// +/// Returns `None` on API failure, JSON parse failure, or a too-short session (non-fatal; caller +/// falls back to the synchronous assessor's verdict). +/// +/// Returns `Some(ShadowVerdict)` on success. The caller is responsible for +/// checking `verdict.is_complete` and `verdict.confidence` before acting. +/// +/// # Session Isolation +/// +/// This function NEVER writes to `messages`. It builds its own isolated +/// `Vec<ChatMessage>` from a slice of `messages` and the judge prompts. +/// The original `messages` slice is borrowed immutably for the duration of +/// this call and is unchanged on return. +pub async fn run_shadow_judge( + provider: &Arc<dyn LLMProvider>, + model: &str, + messages: &[Message], + config: &ShadowJudgeConfig, +) -> Option<ShadowVerdict> { + // Respect minimum session length guard. + if messages.len() < config.min_messages_before_enable { + tracing::debug!( + msg_count = messages.len(), + min = config.min_messages_before_enable, + "shadow judge: skipping — session too short" + ); + return None; + } + + // Trim to last `context_messages` to control token cost. + // WHY: Long sessions have mostly-cached history; sending all is cheap. + // But for very long sessions (100+ messages) we bound to the recent tail. + let context_slice = if config.context_messages > 0 + && messages.len() > config.context_messages + { + &messages[messages.len() - config.context_messages..] + } else { + messages + }; + + // Build the judge's isolated message list.
+ // WHY `build_chat_messages` with `cache_config = None`: + // - We pass the judge's own system prompt (not the main session's). + // - `None` for cache_config → no `cache_control` annotations written. + // - Existing `cache_control` markers from the main session's + // `apply_cache_control` are preserved inside the slice because + // they are stored on the `Message` structs themselves. Anthropic + // will hit those existing cache entries. + let mut chat_messages = build_chat_messages( + Some(SHADOW_JUDGE_SYSTEM_PROMPT), + context_slice, + None, // No new cache breakpoints on the judge call + ); + + // Append judge query as the final user message. + // This is NEVER added to session.messages. + chat_messages.push(edgequake_llm::ChatMessage::user(SHADOW_JUDGE_QUERY)); + + // Make the LLM call — no tools, non-streaming, minimal output tokens. + // + // WHY `chat_with_tools` with empty tool list: The main provider API + // uses `chat_with_tools` uniformly. An empty tool list produces the + // same result as a plain `chat` call but reuses the existing call site. + let response = match provider + .chat_with_tools(&chat_messages, &[], Some(model), None) + .await + { + Ok(r) => r, + Err(e) => { + tracing::warn!( + error = %e, + model = model, + "shadow judge: API call failed (non-fatal, continuing with sync assessor verdict)" + ); + return None; + } + }; + + let raw_text = response.content.trim().to_string(); + let input_tokens = response.usage.as_ref().map_or(0, |u| u.input_tokens as u32); + let output_tokens = response.usage.as_ref().map_or(0, |u| u.output_tokens as u32); + + tracing::debug!( + raw = %raw_text, + input_tokens, + output_tokens, + "shadow judge: raw response" + ); + + parse_shadow_verdict(&raw_text, input_tokens, output_tokens) +} + +/// Resolve the provider and model to use for the shadow judge. +/// +/// Priority: +/// 1. `shadow_judge.model` (if set) — split on '/' for provider+model +/// 2. 
`auxiliary.model` (if set) — split on '/' for provider+model +/// 3. Fallback: caller-supplied `(main_provider, main_model)`. +/// +/// Returns `(provider, model_string)`. +pub fn resolve_shadow_provider_and_model( + shadow_cfg: &ShadowJudgeConfig, + auxiliary_model: Option<&str>, + main_provider: Arc<dyn LLMProvider>, + main_model: &str, +) -> (Arc<dyn LLMProvider>, String) { + // Try shadow_judge.model first, then auxiliary.model. + let candidate = shadow_cfg + .model + .as_deref() + .or(auxiliary_model) + .map(str::trim) + .filter(|s| !s.is_empty()); + + let Some(raw_model) = candidate else { + // No override — use main provider + model. + return (main_provider, main_model.to_string()); + }; + + // "provider/model" → create provider for that family. + if let Some((provider_name, _model_name)) = raw_model.split_once('/') { + let canonical = + edgecrab_tools::vision_models::normalize_provider_name(provider_name); + match edgecrab_tools::create_provider_for_model(&canonical, _model_name) { + Ok(p) => return (p, raw_model.to_string()), + Err(e) => { + tracing::warn!( + error = %e, + raw_model, + "shadow judge: failed to build configured provider, falling back to main provider" + ); + } + } + } + + // Bare model name — reuse main provider credentials. + (main_provider, raw_model.to_string()) +} + +// ─── Private helpers ───────────────────────────────────────────────────────── + +/// Parse the judge's JSON verdict from its response text. +/// +/// Returns `None` if the JSON is malformed or missing required fields. +/// WHY permissive parsing: the judge may wrap JSON in markdown fences despite +/// the system prompt forbidding it; strip them before parsing. +fn parse_shadow_verdict(text: &str, input_tokens: u32, output_tokens: u32) -> Option<ShadowVerdict> { + // Strip optional markdown code fences (e.g. ```json ... 
```) + let cleaned = text + .trim() + .trim_start_matches("```json") + .trim_start_matches("```") + .trim_end_matches("```") + .trim(); + + // Find the JSON object boundaries in case there is leading/trailing text. + let start = cleaned.find('{')?; + let end = cleaned.rfind('}').map(|i| i + 1)?; + let json_slice = &cleaned[start..end]; + + let v: serde_json::Value = serde_json::from_str(json_slice).ok()?; + + let verdict_str = v["verdict"].as_str()?; + let is_complete = verdict_str == "complete"; + let confidence = v["confidence"].as_f64().unwrap_or(0.5) as f32; + let reason = v["reason"] + .as_str() + .unwrap_or("no reason provided") + .to_string(); + let steering_hint = v["steering_hint"] + .as_str() + .filter(|s| !s.is_empty() && *s != "null") + .map(str::to_string); + + Some(ShadowVerdict { + is_complete, + confidence, + reason, + steering_hint, + input_tokens, + output_tokens, + }) +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + // ─── parse_shadow_verdict tests ─────────────────────────────────────────── + + #[test] + fn parse_complete_verdict_ok() { + let json = r#"{"verdict":"complete","confidence":0.95,"reason":"All files created.","steering_hint":null}"#; + let v = parse_shadow_verdict(json, 100, 20).unwrap(); + assert!(v.is_complete); + assert!((v.confidence - 0.95).abs() < 0.01); + assert_eq!(v.reason, "All files created."); + assert!(v.steering_hint.is_none()); + } + + #[test] + fn parse_incomplete_verdict_with_hint() { + let json = r#"{"verdict":"incomplete","confidence":0.88,"reason":"CSS file missing.","steering_hint":"Create style.css with the game styles."}"#; + let v = parse_shadow_verdict(json, 200, 30).unwrap(); + assert!(!v.is_complete); + assert!((v.confidence - 0.88).abs() < 0.01); + assert!(v.steering_hint.is_some()); + assert!(v.steering_hint.unwrap().contains("style.css")); + } + + #[test] + fn parse_strips_markdown_fences() { + let json = 
"```json\n{\"verdict\":\"complete\",\"confidence\":0.9,\"reason\":\"done\",\"steering_hint\":null}\n```"; + let v = parse_shadow_verdict(json, 10, 5).unwrap(); + assert!(v.is_complete); + } + + #[test] + fn parse_invalid_json_returns_none() { + let bad = "This is not JSON at all."; + assert!(parse_shadow_verdict(bad, 0, 0).is_none()); + } + + #[test] + fn parse_missing_verdict_field_returns_none() { + let json = r#"{"confidence":0.9,"reason":"done","steering_hint":null}"#; + assert!(parse_shadow_verdict(json, 0, 0).is_none()); + } + + #[test] + fn parse_json_with_leading_prose() { + // Some models prepend a sentence despite the system prompt. + let json = r#"Here is my verdict: {"verdict":"incomplete","confidence":0.75,"reason":"JS missing.","steering_hint":"Write game.js."}"#; + let v = parse_shadow_verdict(json, 0, 0).unwrap(); + assert!(!v.is_complete); + } + + #[test] + fn parse_null_steering_hint_becomes_none() { + let json = r#"{"verdict":"complete","confidence":0.99,"reason":"All done.","steering_hint":"null"}"#; + let v = parse_shadow_verdict(json, 0, 0).unwrap(); + // "null" string should map to None + assert!(v.steering_hint.is_none()); + } + + #[test] + fn resolve_shadow_provider_and_model_no_override_uses_main() { + // When no shadow model and no auxiliary model are configured, the main + // provider and model are returned unchanged. + // (This test requires a mock provider; placeholder — integrate with + // MockLLMProvider when it is available in the test harness.) + let cfg = ShadowJudgeConfig::default(); + // cfg.model = None, no auxiliary_model + // Assertion: (provider, model) == (main_provider, main_model) + // Full integration test is in conversation.rs integration tests. + let _ = cfg; // suppress unused warning + } +} +``` + +--- + +## 4. Register Module (`crates/edgecrab-core/src/lib.rs`) + +Add the new module declaration after `pub mod sub_agent_runner;` (≈ line 23): + +```rust +pub mod shadow_judge; +``` + +--- + +## 5. 
Integration in `conversation.rs` + +### 5.1 Declare shadow judge counter before the loop + +In `execute_loop`, near the top of the function body (after `let config = ...`), add: + +```rust + // Shadow judge invocation counter — bounded by config.shadow_judge.max_per_session. + // WHY track here: The counter must survive across `continue` iterations. + // It is reset to 0 at loop start (each execute_loop call = one user turn). + let mut shadow_judge_invocations: u32 = 0; + let shadow_judge_cfg = config.shadow_judge.clone(); +``` + +### 5.2 Resolve shadow judge provider once before the loop + +After the `let effective_provider = ...` snapshot at loop start, add: + +```rust + // Resolve the shadow judge provider/model once per session. + // WHY pre-resolution: provider construction may involve env lookups and Arc + // allocation. We do it once and reuse inside the loop. + let (shadow_judge_provider, shadow_judge_model) = if shadow_judge_cfg.enabled { + crate::shadow_judge::resolve_shadow_provider_and_model( + &shadow_judge_cfg, + config.auxiliary.model.as_deref(), + Arc::clone(&effective_provider), + &config.model, + ) + } else { + // Placeholder — unused when shadow judge is disabled. + (Arc::clone(&effective_provider), config.model.clone()) + }; +``` + +### 5.3 Add shadow judge veto at `LoopAction::Done` branch + +The current code at the `LoopAction::Done(text)` branch (≈ line 1913): + +```rust + // (existing code) ... + if should_continue_after_model_text(&provisional_outcome) { + // ... inject follow-up and continue ... + } + + final_response = text; + break; +``` + +Replace the `final_response = text; break;` block with: + +```rust + // ── Shadow Judge veto ────────────────────────────────────────── + // Only fires when: + // 1. Shadow judge is enabled in config + // 2. The synchronous assessor says "Completed" + // (we are past the should_continue_after_model_text gate above) + // 3. Session is long enough (min_messages_before_enable) + // 4. 
We haven't hit the per-session invocation cap + // + // Constraint SJ-1: run_shadow_judge() NEVER mutates session.messages. + // Constraint SJ-3: only downgrade verdict (Completed → Incomplete). + if shadow_judge_cfg.enabled + && shadow_judge_invocations < shadow_judge_cfg.max_per_session + && session.messages.len() >= shadow_judge_cfg.min_messages_before_enable + { + if let Some(verdict) = crate::shadow_judge::run_shadow_judge( + &shadow_judge_provider, + &shadow_judge_model, + &session.messages, + &shadow_judge_cfg, + ) + .await + { + // Accumulate shadow judge tokens into session totals for + // cost tracking and usage display (SJ-10). + session.session_input_tokens += verdict.input_tokens as u64; + session.session_output_tokens += verdict.output_tokens as u64; + + if !verdict.is_complete + && verdict.confidence >= shadow_judge_cfg.confidence_threshold + { + shadow_judge_invocations += 1; + tracing::info!( + invocation = shadow_judge_invocations, + confidence = verdict.confidence, + reason = %verdict.reason, + has_hint = verdict.steering_hint.is_some(), + "shadow judge: task incomplete — continuing loop" + ); + session.messages.push(Message::user( + &build_shadow_judge_message(&verdict), + )); + self.publish_session_state(&session).await; + continue; + } else { + tracing::debug!( + confidence = verdict.confidence, + is_complete = verdict.is_complete, + threshold = shadow_judge_cfg.confidence_threshold, + "shadow judge: verdict is complete or below confidence threshold — proceeding to break" + ); + } + } + } + + final_response = text; + break; +``` + +### 5.4 Add `build_shadow_judge_message` helper function + +Add near `build_completion_follow_up_message` (≈ line 2600): + +```rust +/// Format a user-visible continuation message from a shadow judge verdict. +/// +/// WHY a separate function: mirrors `build_completion_follow_up_message` in +/// style and purpose. 
The shadow judge message is more specific — it carries +/// the judge's precise reason and steering hint — which is more useful to +/// the agent than the generic "do not stop yet" text. +fn build_shadow_judge_message(verdict: &crate::shadow_judge::ShadowVerdict) -> String { + let hint = verdict + .steering_hint + .as_deref() + .filter(|s| !s.is_empty()) + .unwrap_or("Continue working until the original request is fully complete."); + + format!( + "[shadow-judge: {}. {}]", + verdict.reason.trim_end_matches('.'), + hint + ) +} +``` + +--- + +## 6. Provider Resolution Contract + +The shadow judge provider is resolved via `resolve_shadow_provider_and_model()`: + +``` +Priority order: + shadow_judge.model (e.g. "anthropic/claude-haiku-4-20250514") + → split on '/' → canonical provider name + model name + → create_provider_for_model(canonical, model_name) + + If not set: auxiliary.model (same split logic) + + If not set: (main_provider, main_model) +``` + +This matches the pattern in `sub_agent_runner.rs::resolve_child_provider_and_model`. +The helper is `edgecrab_tools::create_provider_for_model` (already used by the +delegation runner). + +--- + +## 7. Token Accounting + +Session token usage is tracked in `SessionState`: +- `session.session_input_tokens: u64` +- `session.session_output_tokens: u64` + +The shadow judge increments these with `verdict.input_tokens` and +`verdict.output_tokens` (extracted from `response.usage`). This ensures: +- `/cost` and `/usage` slash commands reflect shadow judge overhead. +- The final `ConversationResult::usage` is accurate. + +--- + +## 8. 
Configuration YAML Example + +```yaml +# ~/.edgecrab/config.yaml + +# Route side-task calls to a cheap model +auxiliary: + model: "anthropic/claude-haiku-4-20250514" + provider: "anthropic" + +# Enable shadow judge for Nova-lite sessions +shadow_judge: + enabled: true + # model: null → inherits auxiliary.model = claude-haiku-4 + max_per_session: 5 + confidence_threshold: 0.70 + context_messages: 20 + min_messages_before_enable: 4 +``` + +--- + +## 9. Test Plan + +### Unit Tests (in `shadow_judge.rs`) + +| # | Test | Coverage | +|---|------|----------| +| T1 | `parse_complete_verdict_ok` | Happy path complete | +| T2 | `parse_incomplete_verdict_with_hint` | Happy path incomplete + hint | +| T3 | `parse_strips_markdown_fences` | JSON fence tolerance | +| T4 | `parse_invalid_json_returns_none` | Malformed input → None | +| T5 | `parse_missing_verdict_field_returns_none` | Partial JSON → None | +| T6 | `parse_json_with_leading_prose` | Prose-prefixed JSON extraction | +| T7 | `parse_null_steering_hint_becomes_none` | "null" string → Option::None | + +### Integration Tests (add to `conversation.rs` test section) + +| # | Test | Coverage | +|---|------|----------| +| I1 | `shadow_judge_disabled_loop_breaks_normally` | enabled=false → no extra turns | +| I2 | `shadow_judge_incomplete_verdict_continues_loop` | judge says incomplete → loop continues | +| I3 | `shadow_judge_complete_verdict_allows_break` | judge says complete → loop breaks | +| I4 | `shadow_judge_max_per_session_enforced` | after N invocations → judge skipped | +| I5 | `shadow_judge_api_error_is_nonfatal` | provider error → loop breaks normally | +| I6 | `shadow_judge_below_confidence_threshold_allows_break` | low confidence → treated as complete | +| I7 | `shadow_judge_short_session_skipped` | len < min_messages → judge not invoked | +| I8 | `shadow_judge_tokens_added_to_session_usage` | tokens accumulated correctly | + +### Token Cost Validation (manual) + +Run a Nova-lite session with shadow judge 
enabled. After the session, inspect `/cost` and +`/usage` output. Shadow judge overhead should appear in total tokens and be itemized in +session logs at `tracing::debug!` level. + +--- + +## 10. Migration and Rollout + +### Phase 1 — Implementation (this plan) + +- Implement all 8 changes above. +- Deploy with `shadow_judge.enabled: false` default. +- All existing sessions unaffected. + +### Phase 2 — Auto-Suggest for Weak Models + +Update `model_catalog_default.yaml` to add `suggest_shadow_judge: true` on: +- `amazon.nova-lite-v1:0` +- `amazon.nova-micro-v1:0` +- Any model with `context_window < 32768` (proxy for "capacity-limited model") + +The setup wizard reads this flag and suggests enabling shadow judge in `config.yaml` +when the user selects a flagged model. + +### Phase 3 — Heuristic Retirement (Long Term) + +Once shadow judge has accumulated 90 days of production use without regressions, +evaluate retiring the ADR 003 deferred-work heuristic in `completion_assessor.rs`. +The shadow judge's semantic coverage fully subsumes it. + +--- + +## 11. Design Constraints Verification + +| Constraint | How Satisfied | +|------------|--------------| +| C1: Shadow call MUST NOT mutate `session.messages` | `run_shadow_judge()` takes `&[Message]` (immutable borrow). Only the caller writes the steering hint. | +| C2: MUST NOT rebuild main session system prompt | The judge uses its own `SHADOW_JUDGE_SYSTEM_PROMPT`. The main `session.cached_system_prompt` is not touched. | +| C3: MUST produce structured JSON without LLM retry | `parse_shadow_verdict()` attempts JSON extraction with fence-stripping and object boundary search. Returns `None` on failure — caller falls back. | +| C4: MUST be skippable via config | `shadow_judge_cfg.enabled = false` short-circuits before any provider call. | +| C5: MUST bound invocations per session | `shadow_judge_invocations < shadow_judge_cfg.max_per_session` check before every call. 
| +| C6: MUST be non-fatal | All `provider.chat_with_tools()` errors return `None`; caller proceeds to `break` normally. | +| C7: MUST account tokens in session usage | `session.session_input_tokens` and `session.session_output_tokens` incremented with `verdict.input_tokens` and `verdict.output_tokens`. | diff --git a/specs/nova_issue/article.md b/specs/nova_issue/article.md new file mode 100644 index 0000000..66abd00 --- /dev/null +++ b/specs/nova_issue/article.md @@ -0,0 +1,337 @@ +# The Shadow Judge: When AI Agents Need a Second Opinion on Themselves + +*A general exploration of a practical pattern for keeping autonomous AI loops honest.* + +--- + +## WHY: The Agent That Thought It Was Done + +Picture an AI coding assistant running inside your IDE. You ask it to build a small browser game — HTML, CSS, JavaScript, three files. The agent calls its tools, reads the filesystem, writes a partial HTML stub, then emits this: + +> "I'll now create the CSS file with the game styles and wire up the event handlers." + +Then it stops. No CSS. No JavaScript. Session over. + +If you ask the agent whether it finished, it sincerely believes it did. The underlying model produced `end_turn` — the protocol signal for "I am done." The completion logic in the agent framework saw `end_turn`, found no pending tool calls, and broke the loop. From the framework's perspective, every rule was followed. + +The problem is that the agent narrated a future action as if announcing it were the same as executing it. The task is objectively incomplete. But no component in the standard ReAct loop caught it, because no component in that loop had enough context to know. + +This is not a bug in a single model. It is a **structural gap** in how reactive agent frameworks detect completion. 
+ +--- + +## The Standard Completion Problem + +Most ReAct-style agent frameworks use one of three strategies to decide when to stop: + +``` +Standard Completion Decision Strategies +======================================== + + 1. Trust the model's stop signal + ───────────────────────────── + Model emits end_turn or stop reason → loop breaks. + Problem: end_turn is often wrong. + + 2. Detect explicit task marker + ──────────────────────────── + Wait for agent to call report_task_status("done") tool. + Problem: changes the protocol; not all models comply. + + 3. Phrase heuristic + ───────────────── + Scan final message for future-tense phrases ("I will", "Let me"). + If found → inject nudge; continue loop. + Problem: brittle, incomplete vocabulary, + no semantic understanding. +``` + +All three approaches share the same fundamental limitation: they operate on **surface signals** — stop codes, tool names, or vocabulary lists — not on the **semantic content** of the actual conversation. + +--- + +## Why Heuristics Are Not Enough + +A phrase-matching heuristic seems appealing at first. If the model says "I will write the file", keep going. Simple. + +But natural language has infinitely many surface forms for the same underlying meaning: + +``` +Deferred-intent phrases (partial list): + "I will..." "Let me..." + "I'll now..." "I should now..." + "I plan to..." "The next step is..." + "I intend to..." "Subsequently I will..." + "I'll go ahead and..." "I need to..." + ... and dozens more across languages and model fine-tunes +``` + +No finite list covers all of them. Every model update — fine-tuning, RLHF, instruction tuning — can change the phrasing without changing the underlying intent. A heuristic curated for one model version is silently stale for the next. + +Worse, heuristics cause **false positives**: the model genuinely finished the task but used a preamble like "Let me write that poem for you: [poem text follows]." 
The phrase fires, the loop continues unnecessarily, and the agent re-does work it already completed. + +The fundamental issue: a heuristic has no concept of what the original user task was. It cannot distinguish "I will write the file" (deferred) from "Let me write you that poem: [inline delivery]" (complete). Only semantic understanding of the full conversation trajectory can make that distinction. + +--- + +## First Principles: What Does "Done" Actually Mean? + +A task is complete if and only if: + +> Every sub-goal of the original user request has been addressed, **with evidence** that the state of the world matches the intended outcome. + +This definition has three important properties: + +``` +Properties of a Sound "Done" Test +=================================== + + 1. SEMANTIC + Depends on understanding intent, not surface text. + "I wrote the file" is only evidence if the tool call succeeded. + + 2. TRAJECTORY-GLOBAL + Requires reading the full conversation, not just the last message. + Sub-goal 1 may have been addressed 20 messages ago. + + 3. EVIDENCE-REQUIRING + A promise is not evidence. An announcement is not evidence. + Only tool output or verifiable artifact is evidence. +``` + +When you put these three requirements together, one thing becomes clear: the right entity to evaluate completion is something that understands **language**, **context**, and **evidence chains** — in short, an LLM. + +The entity best suited to verify that an LLM finished a task is another LLM. + +--- + +## Introducing the Shadow Judge + +A **Shadow Judge** is a single, stateless LLM classification call that fires *after* the primary agent signals completion, with one job: verify the claim. 
+ +``` +Shadow Judge — Control Flow +============================ + + Main Agent Loop (ReAct) + ┌─────────────────────────────────────────────────────┐ + │ │ + │ User message │ + │ │ │ + │ ▼ │ + │ ┌──────────┐ tool call ┌──────────────────┐ │ + │ │ LLM │ ─────────────► │ Tool Executor │ │ + │ │ (main) │ ◄───────────── │ (file/terminal) │ │ + │ └──────────┘ tool result └──────────────────┘ │ + │ │ │ + │ │ end_turn / stop │ + │ ▼ │ + │ ┌──────────────────────────┐ │ + │ │ Synchronous Assessor │ fast, no API call │ + │ │ (heuristic pass 1) │ │ + │ └──────────┬───────────────┘ │ + │ │ "Completed" │ + │ ▼ │ + │ ┌──────────────────────────┐ │ + │ │ SHADOW JUDGE │ one LLM call │ + │ │ (semantic pass 2) │ │ + │ └──────────┬───────────────┘ │ + │ │ │ + │ ┌───────┴────────┐ │ + │ │ │ │ + │ complete incomplete │ + │ │ │ │ + │ ▼ ▼ │ + │ break loop inject steering hint │ + │ ─────────────────── │ + │ push nudge message │ + │ into session.messages │ + │ → loop continues │ + └─────────────────────────────────────────────────────┘ +``` + +The Shadow Judge is: + +- **A single chat API call** — no tools, no streaming, no session persistence. +- **Read-only** — it never writes to the main conversation's message history. +- **Veto-only** — it can downgrade "completed" to "incomplete" but can never upgrade "incomplete" to "completed." +- **Fast and cheap** — with a lightweight judge model and prompt caching, each invocation costs roughly $0.004. +- **Opt-in** — it does not activate unless explicitly configured. + +--- + +## Session Isolation: A Critical Design Constraint + +One of the most important properties of the Shadow Judge is that it is **invisible to the main conversation**. + +``` +Session Isolation Diagram +========================== + + session.messages (main loop, never mutated by judge) + ┌──────────────────────────────────────────────────┐ + │ [system] cached stable system prompt │ + │ [user] "build a browser game" │ + │ [asst] "I'll start with the HTML..." 
│ + │ [tool] write_file("index.html", ...) │ + │ [asst] "I'll now create style.css..." │ ← end_turn + └──────────────────────────────────────────────────┘ + │ + │ clone (read-only snapshot) + ▼ + shadow_messages (ephemeral, discarded after judge call) + ┌──────────────────────────────────────────────────┐ + │ [system] SHADOW_JUDGE_SYSTEM_PROMPT │ + │ [user] ... (cloned from session) ... │ + │ [asst] ... (cloned from session) ... │ + │ [user] SHADOW_JUDGE_QUERY (appended only │ + │ in this ephemeral list) │ + └──────────────────────────────────────────────────┘ + │ + │ provider.chat(shadow_messages) + ▼ + {"verdict":"incomplete","confidence":0.92, + "reason":"CSS and JS files not created.", + "steering_hint":"Write style.css and game.js."} + │ + │ verdict.is_complete == false + ▼ + session.messages.push( + Message::user("[system: do not stop. Create style.css and game.js.]") + ) + → main loop continues +``` + +The Anthropic prompt cache — keyed on the prefix of the main message list — is never invalidated. The judge's isolated HTTP request uses a separate message array. The main session's cached tokens are preserved, keeping cost low for both the judge call and all subsequent main loop calls. + +--- + +## The Verdict Schema + +The judge is prompted to output a single JSON object: + +```json +{ + "verdict": "complete" | "incomplete", + "confidence": 0.0 to 1.0, + "reason": "one sentence explaining the verdict", + "steering_hint": "specific next action if incomplete, or null" +} +``` + +No prose. No markdown fences. Structured and machine-readable. + +The `steering_hint` is the high-value field: instead of a generic "keep working" nudge, the judge tells the main agent *exactly* what is missing. "Create style.css with the game styles" is more actionable than "the task is not done." 
+ +--- + +## The Invariants + +Ten invariants govern a correctly implemented Shadow Judge: + +``` +Shadow Judge Invariants +======================== + + SJ-1 Never writes to session.messages (read-only) + SJ-2 History is a clone; never the live reference + SJ-3 Veto-only: Completed → Incomplete (never reverse) + SJ-4 Isolated provider call; own message list + SJ-5 Reuses cached stable system block (cache HIT) + SJ-6 Opt-in and configurable per model family + SJ-7 Per-session invocation cap (prevents spirals) + SJ-8 Fires only when synchronous assessor passes + SJ-9 Output is structured JSON (auditable) + SJ-10 Token costs attributed to session total +``` + +--- + +## Cost and Latency Analysis + +The cost argument for the Shadow Judge is compelling precisely because it is not symmetric. The question it answers — "did the agent actually finish?" — is much narrower than the question the main agent answers. A small, cheap model can answer it reliably. + +``` +Per-Call Cost (10k-token session, claude-haiku-4 as judge) +============================================================ + + Component Tokens Rate Cost + ─────────────────────────────────────────────────────────── + Stable system (cached) 2,000 $0.30/MTok read $0.0006 + Conversation (cached) 10,000 $0.30/MTok read $0.003 + Judge prompt (new) 120 $3.75/MTok write $0.00045 + Verdict output 60 $1.25/MTok output $0.000075 + ─────────────────────────────────────────────────────────── + Total per invocation ≈ $0.004 + + Typical Opus-4 session cost: $0.50–$2.00 + Shadow judge overhead: < 0.8% of session cost +``` + +The latency cost is 300–800ms on a fast model. The alternative — a missed completion that requires the user to manually re-prompt, observe incomplete output, and restart — routinely costs 30+ seconds and complete loss of context. + +--- + +## Why Not a Child Agent? + +A reasonable question: why not spin up a child agent to verify completion? This would reuse the existing delegation infrastructure. 
+ +The answer is cost and precision. A child agent — even a minimal one — comes with: + +- Its own `execute_loop` with tool dispatch +- A session database connection +- A todo store +- An iteration budget +- A separate streaming channel + +This is appropriate for a *task*. It is 100× over-engineered for a binary classification question. The Shadow Judge needs exactly one API call, one response, one JSON parse. Nothing else. + +--- + +## Where This Pattern Applies + +The Shadow Judge is not specific to any particular agent framework or model family. The pattern applies wherever: + +1. An agent loop breaks on a model-emitted stop signal. +2. The agent's task involves multiple sub-goals (not single-turn Q&A). +3. Completion matters — the loop should not exit early, nor run indefinitely. +4. Token cost is a constraint. + +The pattern is model-agnostic. It was motivated by the specific case of AWS Bedrock Nova-lite, which frequently emits `end_turn` with future-tense narration. But because it operates semantically, it generalizes: it catches the same failure mode regardless of which model exhibits it, and regardless of how that model phrases the deferred intent. + +The layered architecture is key: + +``` +Layered Completion Gate +======================== + + Layer 1: Synchronous heuristic + ──────────────────────────────── + Cost: 0 API calls + Latency: <1ms + Coverage: known patterns on known models + Correct when: the failure mode is predictable and narrow + ↓ passes "Completed" + + Layer 2: Shadow Judge (async LLM) + ──────────────────────────────────── + Cost: ~$0.004 per invocation + Latency: 300–800ms + Coverage: any model, any phrasing, any sub-goal structure + Correct when: the task has verifiable sub-goals + ↓ vetoes if evidence gaps found +``` + +Layer 1 handles the common, cheap case. Layer 2 catches what Layer 1 cannot. + +--- + +## Summary + +The Shadow Judge is a lightweight LLM verification step inserted at the moment an agent claims to be done. 
It does not change how the main agent works. It does not change the protocol the model follows. It adds one cheap read-only oracle call at the single point where the system is most likely to be wrong: the completion boundary. + +The central insight is that **the best entity to verify that an LLM finished a task is another LLM** — specifically one given a narrow, well-scoped classification task rather than an open-ended generative one. The result is semantic verification at heuristic cost. + +When the judge says "incomplete," the loop continues with a targeted hint. When the judge says "complete" — or errors out — the loop breaks normally. The system degrades gracefully in every failure mode. + +The pattern is simple enough to implement in a few hundred lines. The impact — measured in tasks that actually finish rather than stopping one step short — is substantial. diff --git a/specs/nova_issue/article_short.md b/specs/nova_issue/article_short.md new file mode 100644 index 0000000..dfb8122 --- /dev/null +++ b/specs/nova_issue/article_short.md @@ -0,0 +1,42 @@ +# The Shadow Judge: Teaching AI Agents to Doubt Themselves + +Your AI agent just said "I'll now create the CSS file." Then stopped. Task incomplete. No CSS. Session over. + +This is one of the most common silent failures in autonomous AI agents: the model emits a stop signal while narrating a future action as if announcing it were the same as executing it. The framework sees "stop," breaks the loop, and returns a half-finished result to the user. + +The naive fix is a phrase-matching heuristic — scan for "I will," "Let me," "I'm going to." Block the exit if those phrases appear. It works for a week, until the model updates and starts saying "I should now" or "The next step is." No finite vocabulary list can cover all surface forms of deferred intent. Every model update silently breaks coverage. 
+ +**The real fix requires semantic understanding.** A task is complete only when every sub-goal has been addressed *with evidence* — tool output, created files, verified results. A phrase list has no concept of the original user goal. It can't distinguish "Let me write the file" (deferred) from "Let me write that poem for you: [poem follows]" (complete delivery). + +The right entity to verify that an LLM finished a task is **another LLM** — one given a narrow classification task rather than an open-ended generative one. + +``` +Main Agent says "done" + ↓ +Synchronous heuristic (fast, free) + ↓ passes +Shadow Judge — one LLM call + "Has every sub-goal been completed with evidence?" + ↓ ↓ + complete incomplete + ↓ ↓ + break loop inject targeted hint → loop continues +``` + +The Shadow Judge is a single, stateless chat API call. It takes a read-only snapshot of the conversation history, prepends a minimal system prompt ("you are a completion oracle, output JSON only"), and returns a structured verdict: `complete/incomplete`, a confidence score, and — critically — a specific `steering_hint` telling the main agent exactly what is missing. + +It **never writes to the main session**. It never invalidates the prompt cache. It reuses the cached conversation tokens, so on Anthropic each judge call costs ~$0.004 — at most about 0.8% of a typical session. Latency: 300–800ms on a fast model. + +Three design constraints make this safe to deploy: + +**Veto-only semantics.** The judge can downgrade "complete" to "incomplete" but can never upgrade "incomplete" to "complete." This asymmetry prevents false terminations — the conservative direction is always to keep working. + +**Session isolation.** The judge's message list is an ephemeral clone. The main conversation is untouched. The Anthropic prompt cache is preserved for all subsequent main-loop calls.
+ +**Bounded by a per-session cap.** A maximum of 5 invocations per session prevents infinite correction spirals if something else is wrong. + +The result: tasks that stopped one step short now finish. At heuristic cost. With semantic accuracy. + +--- + +*The Shadow Judge pattern was developed while debugging premature stop behavior on AWS Bedrock Nova-lite. It generalizes to any ReAct agent loop where completion matters and model stop signals cannot be fully trusted.*