diff --git a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md index 57eded5..3321c3c 100644 --- a/benchmarks/dataset-agent/README.md +++ b/benchmarks/dataset-agent/README.md @@ -21,6 +21,25 @@ Real Mastra benchmark runs require `OPENROUTER_API_KEY` and `TINYFISH_API_KEY` loaded execution-only. If either is missing, the adapter returns a blocked benchmark result instead of touching app data. +## Run Collection Inside Self-Healing + +The collection adapter uses the same benchmark runner, but wraps +`CollectionPopulateRecipeRuntime` inside `SelfHealingPopulateRecipeService`. +That means collection results are scored after the same recipe generation, +repair, validation, and promotion path as the app runtime. + +```bash +node benchmarks/dataset-agent/run-benchmark.mjs \ + --prompt-ids latest-ai-blog-posts,saas-pricing-pages \ + --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs' +``` + +Real collection benchmark runs require `OPENROUTER_API_KEY`, +`TINYFISH_API_KEY`, and `BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE` loaded in +the shell. The runner module must export `runCollectionPopulatePipeline(input)` +or a default runner that accepts `CollectionPopulatePipelineInput` and returns a +`PopulateRuntimeResult`. + ## Verify Self-Healing Stack Use this before asking someone else to migrate a new collection agent into the @@ -30,9 +49,9 @@ app path: make verify-self-healing ``` -That command runs backend tests, backend build, adapter syntax checks, and a -no-key benchmark smoke that must produce a clean `blocked` result without -spending OpenRouter or TinyFish credits. +That command runs backend tests, backend build, adapter syntax checks, and +Mastra + collection no-key benchmark smokes that must produce clean `blocked` +results without spending OpenRouter or TinyFish credits. Live checks are explicit: diff --git a/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs b/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs new file mode 100644 index 0000000..06e4f0c --- /dev/null +++ b/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs @@ -0,0 +1,163 @@ +#!/usr/bin/env node +import { pathToFileURL } from "node:url"; +import { resolve } from "node:path"; + +const prompt = requiredEnv("BIGSET_BENCHMARK_PROMPT"); +const promptId = process.env.BIGSET_BENCHMARK_PROMPT_ID ?? "benchmark-prompt"; +const promptQuality = process.env.BIGSET_BENCHMARK_PROMPT_QUALITY ?? "unknown"; +const requiredColumns = columnList( + requiredEnv("BIGSET_BENCHMARK_REQUIRED_COLUMNS") +); +const minimumRequiredColumns = columnList( + process.env.BIGSET_BENCHMARK_MINIMUM_REQUIRED_COLUMNS ?? "" +); + +const missingRuntimeKeys = ["OPENROUTER_API_KEY", "TINYFISH_API_KEY"].filter( + (name) => !process.env[name] +); +if (missingRuntimeKeys.length > 0) { + console.log(JSON.stringify({ + rows: [], + validationIssues: [ + `Missing ${missingRuntimeKeys.join(", ")} for collection self-healing benchmark.`, + ], + usage: emptyUsage(), + metrics: emptyMetrics(), + })); + process.exit(0); +} + +const collectionRunner = await loadCollectionRunner(); +if (!collectionRunner) { + console.log(JSON.stringify({ + rows: [], + validationIssues: [ + "Collection self-healing benchmark runner is not configured. Set BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE to a module exporting runCollectionPopulatePipeline(input).", + ], + usage: emptyUsage(), + metrics: emptyMetrics(), + })); + process.exit(0); +} + +const { + diagnosticRunForTick, + validationIssuesForSelfHealingTick, +} = await import( + "../../../backend/src/pipeline/populate-self-healing-runner.ts" +); +const { + DefaultPopulateRecipeAuthor, + InMemoryPopulateRecipeStore, + SelfHealingPopulateRecipeService, +} = await import( + "../../../backend/src/pipeline/populate-self-healing.ts" +); +const { + CollectionPopulateRecipeRuntime, +} = await import( + "../../../backend/src/pipeline/populate-collection-runtime.ts" +); + +const context = { + datasetId: `benchmark-${safeIdSegment(promptId)}`, + datasetName: `benchmark_${safeIdSegment(promptId)}`, + description: prompt, + columns: requiredColumns.map((columnName) => ({ + name: columnName, + type: inferPopulateColumnType(columnName), + description: `Benchmark requested column for ${promptQuality} prompt.`, + })), +}; +const service = new SelfHealingPopulateRecipeService({ + store: new InMemoryPopulateRecipeStore(), + runtime: new CollectionPopulateRecipeRuntime({ + runPipeline: collectionRunner, + targetRows: Number(process.env.BIGSET_COLLECTION_BENCHMARK_MAX_ROWS ?? "10"), + }), + author: new DefaultPopulateRecipeAuthor(), +}); +const tick = await service.tick({ datasetId: context.datasetId, context }); +const result = diagnosticRunForTick(tick); + +console.log(JSON.stringify({ + rows: result?.rows ?? [], + validationIssues: [ + ...validationIssuesForSelfHealingTick(tick), + ...minimumColumnIssues(result?.rows ?? []), + ], + usage: result?.usage ?? emptyUsage(), + metrics: result?.metrics ?? emptyMetrics(), +})); + +async function loadCollectionRunner() { + const moduleSpecifier = process.env.BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE; + if (!moduleSpecifier) { + return undefined; + } + const moduleUrl = moduleSpecifier.startsWith(".") || moduleSpecifier.startsWith("/") + ? pathToFileURL(resolve(moduleSpecifier)).href + : moduleSpecifier; + const loaded = await import(moduleUrl); + const runner = loaded.runCollectionPopulatePipeline ?? loaded.default; + if (typeof runner !== "function") { + throw new Error( + `${moduleSpecifier} must export runCollectionPopulatePipeline(input) or a default runner.` + ); + } + return runner; +} + +function minimumColumnIssues(rows) { + const issues = []; + for (const [rowIndex, row] of rows.entries()) { + for (const columnName of minimumRequiredColumns) { + const value = row.cells?.[columnName]; + if (value === undefined || value === null || value === "") { + issues.push(`Row ${rowIndex} missing minimum required column ${columnName}.`); + } + } + } + return issues; +} + +function inferPopulateColumnType(columnName) { + if (/(url|website|link|page)$/i.test(columnName)) return "url"; + if (/(date|_at)$/i.test(columnName)) return "date"; + if (/^(is_|has_|can_)/i.test(columnName)) return "boolean"; + if (/(count|price|amount|score|number|total)/i.test(columnName)) return "number"; + return "text"; +} + +function safeIdSegment(value) { + return String(value).replace(/[^a-zA-Z0-9._-]/g, "_").slice(0, 80); +} + +function columnList(value) { + return value + .split(",") + .map((columnName) => columnName.trim()) + .filter(Boolean); +} + +function emptyUsage() { + return { promptTokens: 0, completionTokens: 0, totalTokens: 0 }; +} + +function emptyMetrics() { + return { + searchCalls: 0, + fetchCalls: 0, + browserCalls: 0, + agentRuns: 0, + agentSteps: 0, + }; +} + +function requiredEnv(name) { + const value = process.env[name]; + if (!value) { + throw new Error(`Missing ${name}. Run through run-benchmark.mjs.`); + } + return value; +} diff --git a/scripts/verify-self-healing-stack.sh b/scripts/verify-self-healing-stack.sh index 58c4793..6e8eacf 100755 --- a/scripts/verify-self-healing-stack.sh +++ b/scripts/verify-self-healing-stack.sh @@ -90,18 +90,20 @@ check_convex_ready() { } run_blocked_benchmark_smoke() { - local out_dir="benchmark-results/self-healing-blocked-smoke-$(date +%Y%m%d-%H%M%S)" + local system_name="$1" + local system_command="$2" + local out_dir="benchmark-results/${system_name}-blocked-smoke-$(date +%Y%m%d-%H%M%S)" local stdout_file="${out_dir}/runner-stdout.json" mkdir -p "$out_dir" - printf 'RUN mastra benchmark no-key blocked smoke\n' + printf 'RUN %s benchmark no-key blocked smoke\n' "$system_name" if ! env -u OPENROUTER_API_KEY -u TINYFISH_API_KEY node benchmarks/dataset-agent/run-benchmark.mjs \ --prompt-ids latest-ai-blog-posts \ --timeout-ms 60000 \ --out "$out_dir" \ - --system "mastra=node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs" \ + --system "${system_name}=${system_command}" \ > "$stdout_file"; then - mark_fail "mastra benchmark no-key blocked smoke" + mark_fail "${system_name} benchmark no-key blocked smoke" return fi @@ -154,9 +156,9 @@ for (const result of summary.laneResults ?? []) { } } ' "${out_dir}/summary.json"; then - mark_pass "mastra benchmark no-key blocked smoke (${out_dir})" + mark_pass "${system_name} benchmark no-key blocked smoke (${out_dir})" else - mark_fail "mastra benchmark no-key blocked smoke" + mark_fail "${system_name} benchmark no-key blocked smoke" fi } @@ -253,10 +255,16 @@ if [[ "$SHOULD_RUN_LOCAL_GATES" -eq 1 ]]; then run_required_step "backend tests" npm --prefix backend test run_required_step "backend build" npm --prefix backend run build run_required_step "mastra adapter syntax" node --check benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs + run_required_step "collection adapter syntax" node --check benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs fi if [[ "$SHOULD_RUN_BLOCKED_BENCHMARK_SMOKE" -eq 1 ]]; then - run_blocked_benchmark_smoke + run_blocked_benchmark_smoke \ + "mastra" \ + "node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs" + run_blocked_benchmark_smoke \ + "collection-self-heal" \ + "node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs" fi if [[ "$SHOULD_RUN_CONVEX_PUSH" -eq 1 ]]; then