Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions benchmarks/dataset-agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,25 @@ Real Mastra benchmark runs require `OPENROUTER_API_KEY` and `TINYFISH_API_KEY`
loaded execution-only. If either is missing, the adapter returns a blocked
benchmark result instead of touching app data.

## Run Collection Inside Self-Healing

The collection adapter uses the same benchmark runner, but wraps
`CollectionPopulateRecipeRuntime` inside `SelfHealingPopulateRecipeService`.
That means collection results are scored after the same recipe generation,
repair, validation, and promotion path as the app runtime.

```bash
node benchmarks/dataset-agent/run-benchmark.mjs \
--prompt-ids latest-ai-blog-posts,saas-pricing-pages \
--system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs'
```

Real collection benchmark runs require `OPENROUTER_API_KEY`,
`TINYFISH_API_KEY`, and `BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE` loaded in
the shell. The runner module must export `runCollectionPopulatePipeline(input)`
or a default runner that accepts `CollectionPopulatePipelineInput` and returns a
`PopulateRuntimeResult`.

## Verify Self-Healing Stack

Use this before asking someone else to migrate a new collection agent into the
Expand All @@ -30,9 +49,9 @@ app path:
make verify-self-healing
```

That command runs backend tests, backend build, adapter syntax checks, and a
no-key benchmark smoke that must produce a clean `blocked` result without
spending OpenRouter or TinyFish credits.
That command runs backend tests, backend build, adapter syntax checks, and
Mastra + collection no-key benchmark smokes that must produce clean `blocked`
results without spending OpenRouter or TinyFish credits.

Live checks are explicit:

Expand Down
163 changes: 163 additions & 0 deletions benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
#!/usr/bin/env node
import { pathToFileURL } from "node:url";
import { resolve } from "node:path";

// Benchmark adapter entry point: reads the prompt and column requirements from
// BIGSET_BENCHMARK_* environment variables, runs one self-healing populate tick
// around the configured collection runner, and prints a single JSON result
// object ({ rows, validationIssues, usage, metrics }) to stdout.

// Prompt text and required columns are mandatory; requiredEnv throws when unset.
const prompt = requiredEnv("BIGSET_BENCHMARK_PROMPT");
const promptId = process.env.BIGSET_BENCHMARK_PROMPT_ID ?? "benchmark-prompt";
const promptQuality = process.env.BIGSET_BENCHMARK_PROMPT_QUALITY ?? "unknown";
const requiredColumns = columnList(
requiredEnv("BIGSET_BENCHMARK_REQUIRED_COLUMNS")
);
// Optional: an unset variable yields an empty list, which disables the
// per-row minimum-column check performed by minimumColumnIssues.
const minimumRequiredColumns = columnList(
process.env.BIGSET_BENCHMARK_MINIMUM_REQUIRED_COLUMNS ?? ""
);

// Guard 1: without both API keys, report the problem as a validation issue and
// exit 0 before the runner module or any backend module is loaded.
const missingRuntimeKeys = ["OPENROUTER_API_KEY", "TINYFISH_API_KEY"].filter(
(name) => !process.env[name]
);
if (missingRuntimeKeys.length > 0) {
console.log(JSON.stringify({
rows: [],
validationIssues: [
`Missing ${missingRuntimeKeys.join(", ")} for collection self-healing benchmark.`,
],
usage: emptyUsage(),
metrics: emptyMetrics(),
}));
process.exit(0);
}

// Guard 2: loadCollectionRunner returns undefined when
// BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE is unset; report that the same way.
// (It throws instead when the module loads but exports no runner function.)
const collectionRunner = await loadCollectionRunner();
if (!collectionRunner) {
console.log(JSON.stringify({
rows: [],
validationIssues: [
"Collection self-healing benchmark runner is not configured. Set BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE to a module exporting runCollectionPopulatePipeline(input).",
],
usage: emptyUsage(),
metrics: emptyMetrics(),
}));
process.exit(0);
}

// Backend modules are imported dynamically, after both guards, so the
// early-exit paths above never load them. They are TypeScript sources; the
// documented invocation runs node with the tsx loader (`--import .../tsx/...`).
const {
diagnosticRunForTick,
validationIssuesForSelfHealingTick,
} = await import(
"../../../backend/src/pipeline/populate-self-healing-runner.ts"
);
const {
DefaultPopulateRecipeAuthor,
InMemoryPopulateRecipeStore,
SelfHealingPopulateRecipeService,
} = await import(
"../../../backend/src/pipeline/populate-self-healing.ts"
);
const {
CollectionPopulateRecipeRuntime,
} = await import(
"../../../backend/src/pipeline/populate-collection-runtime.ts"
);

// Synthetic dataset context derived from the benchmark prompt: ids are built
// from the sanitized prompt id, and each required column gets a type guessed
// from its name by inferPopulateColumnType.
const context = {
datasetId: `benchmark-${safeIdSegment(promptId)}`,
datasetName: `benchmark_${safeIdSegment(promptId)}`,
description: prompt,
columns: requiredColumns.map((columnName) => ({
name: columnName,
type: inferPopulateColumnType(columnName),
description: `Benchmark requested column for ${promptQuality} prompt.`,
})),
};
// Wrap the collection runtime in the self-healing service with a fresh
// in-memory recipe store.
const service = new SelfHealingPopulateRecipeService({
store: new InMemoryPopulateRecipeStore(),
runtime: new CollectionPopulateRecipeRuntime({
runPipeline: collectionRunner,
// Defaults to 10 rows when the variable is unset.
// NOTE(review): the value is not validated here; a non-numeric string
// becomes NaN and an empty string becomes 0 — confirm the runtime copes.
targetRows: Number(process.env.BIGSET_COLLECTION_BENCHMARK_MAX_ROWS ?? "10"),
}),
author: new DefaultPopulateRecipeAuthor(),
});
const tick = await service.tick({ datasetId: context.datasetId, context });
// The code below tolerates a missing diagnostic run (result may be nullish).
const result = diagnosticRunForTick(tick);

// Final result: issues reported for the tick plus this adapter's own
// minimum-column check; usage/metrics fall back to zeroed objects.
console.log(JSON.stringify({
rows: result?.rows ?? [],
validationIssues: [
...validationIssuesForSelfHealingTick(tick),
...minimumColumnIssues(result?.rows ?? []),
],
usage: result?.usage ?? emptyUsage(),
metrics: result?.metrics ?? emptyMetrics(),
}));

/**
 * Loads the collection pipeline runner named by
 * BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE.
 *
 * Specifiers starting with "." or "/" are treated as filesystem paths,
 * resolved against the current working directory and imported as file URLs.
 * Any other specifier is handed to `import()` unchanged.
 *
 * @returns {Promise<Function|undefined>} The module's
 *   `runCollectionPopulatePipeline` export, else its default export;
 *   `undefined` when the environment variable is unset or empty.
 * @throws {Error} When the module loads but neither export is a function.
 */
async function loadCollectionRunner() {
  const { BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE: moduleSpecifier } = process.env;
  if (!moduleSpecifier) {
    return undefined;
  }

  const isPathLike = [".", "/"].some((prefix) => moduleSpecifier.startsWith(prefix));
  let importTarget = moduleSpecifier;
  if (isPathLike) {
    // import() needs a file URL for filesystem paths, not a bare path string.
    importTarget = pathToFileURL(resolve(moduleSpecifier)).href;
  }

  const namespace = await import(importTarget);
  const candidate = namespace.runCollectionPopulatePipeline ?? namespace.default;
  if (typeof candidate === "function") {
    return candidate;
  }
  throw new Error(
    `${moduleSpecifier} must export runCollectionPopulatePipeline(input) or a default runner.`
  );
}

/**
 * Reports every minimum-required column that is absent or blank in a row.
 *
 * A cell counts as missing when it is `undefined`, `null`, or the empty
 * string. Rows without a `cells` object are treated as missing every column.
 * Column names come from the module-level `minimumRequiredColumns` list.
 *
 * @param {Array<{cells?: Object}>} rows - Rows produced by the populate run.
 * @returns {string[]} One message per missing cell, ordered by row and then
 *   by column.
 */
function minimumColumnIssues(rows) {
  const isBlank = (cell) => cell === undefined || cell === null || cell === "";
  return Array.from(rows.entries()).flatMap(([rowIndex, row]) =>
    minimumRequiredColumns
      .filter((columnName) => isBlank(row.cells?.[columnName]))
      .map((columnName) => `Row ${rowIndex} missing minimum required column ${columnName}.`)
  );
}

/**
 * Guesses a populate column type from the column's name.
 *
 * Rules are tried in order and the first match wins: a url-like suffix, a
 * date-like suffix, a boolean-style prefix, then a numeric keyword anywhere
 * in the name. All matching is case-insensitive.
 *
 * @param {string} columnName - Column name to classify.
 * @returns {"url"|"date"|"boolean"|"number"|"text"} The inferred type;
 *   "text" when no rule matches.
 */
function inferPopulateColumnType(columnName) {
  const orderedRules = [
    [/(url|website|link|page)$/i, "url"],
    [/(date|_at)$/i, "date"],
    [/^(is_|has_|can_)/i, "boolean"],
    [/(count|price|amount|score|number|total)/i, "number"],
  ];
  for (const [pattern, columnType] of orderedRules) {
    if (pattern.test(columnName)) {
      return columnType;
    }
  }
  return "text";
}

/**
 * Converts an arbitrary value into a string safe to embed in an identifier.
 *
 * The value is stringified, every character outside letters, digits, ".",
 * "_" and "-" is replaced with "_", and the result is cut to 80 characters.
 *
 * @param {*} value - Value to sanitize (stringified with `String`).
 * @returns {string} The sanitized segment, at most 80 characters long.
 */
function safeIdSegment(value) {
  const maxLength = 80;
  const text = String(value);
  const sanitized = text.replace(/[^a-zA-Z0-9._-]/g, "_");
  return sanitized.slice(0, maxLength);
}

/**
 * Parses a comma-separated list of column names.
 *
 * Each entry is trimmed of surrounding whitespace; entries that are empty
 * after trimming are dropped.
 *
 * @param {string} value - Comma-separated column names.
 * @returns {string[]} The non-empty, trimmed names in their original order.
 */
function columnList(value) {
  const names = [];
  for (const rawName of value.split(",")) {
    const trimmedName = rawName.trim();
    if (trimmedName) {
      names.push(trimmedName);
    }
  }
  return names;
}

/**
 * Builds a token-usage record with every counter at zero, used when no run
 * produced usage data.
 *
 * @returns {{promptTokens: number, completionTokens: number, totalTokens: number}}
 *   A fresh object on every call.
 */
function emptyUsage() {
  const counterNames = ["promptTokens", "completionTokens", "totalTokens"];
  return Object.fromEntries(counterNames.map((counterName) => [counterName, 0]));
}

/**
 * Builds a run-metrics record with every counter at zero, used when no run
 * produced metrics.
 *
 * @returns {{searchCalls: number, fetchCalls: number, browserCalls: number,
 *   agentRuns: number, agentSteps: number}} A fresh object on every call.
 */
function emptyMetrics() {
  const counterNames = [
    "searchCalls",
    "fetchCalls",
    "browserCalls",
    "agentRuns",
    "agentSteps",
  ];
  return Object.fromEntries(counterNames.map((counterName) => [counterName, 0]));
}

/**
 * Reads an environment variable that must be present.
 *
 * @param {string} name - Environment variable name.
 * @returns {string} The variable's non-empty value.
 * @throws {Error} When the variable is unset or empty; the message points the
 *   caller at run-benchmark.mjs.
 */
function requiredEnv(name) {
  const { [name]: value } = process.env;
  if (value) {
    return value;
  }
  throw new Error(`Missing ${name}. Run through run-benchmark.mjs.`);
}
22 changes: 15 additions & 7 deletions scripts/verify-self-healing-stack.sh
Original file line number Diff line number Diff line change
Expand Up @@ -90,18 +90,20 @@ check_convex_ready() {
}

run_blocked_benchmark_smoke() {
local out_dir="benchmark-results/self-healing-blocked-smoke-$(date +%Y%m%d-%H%M%S)"
local system_name="$1"
local system_command="$2"
local out_dir="benchmark-results/${system_name}-blocked-smoke-$(date +%Y%m%d-%H%M%S)"
local stdout_file="${out_dir}/runner-stdout.json"

mkdir -p "$out_dir"
printf 'RUN mastra benchmark no-key blocked smoke\n'
printf 'RUN %s benchmark no-key blocked smoke\n' "$system_name"
if ! env -u OPENROUTER_API_KEY -u TINYFISH_API_KEY node benchmarks/dataset-agent/run-benchmark.mjs \
--prompt-ids latest-ai-blog-posts \
--timeout-ms 60000 \
--out "$out_dir" \
--system "mastra=node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs" \
--system "${system_name}=${system_command}" \
> "$stdout_file"; then
mark_fail "mastra benchmark no-key blocked smoke"
mark_fail "${system_name} benchmark no-key blocked smoke"
return
fi

Expand Down Expand Up @@ -154,9 +156,9 @@ for (const result of summary.laneResults ?? []) {
}
}
' "${out_dir}/summary.json"; then
mark_pass "mastra benchmark no-key blocked smoke (${out_dir})"
mark_pass "${system_name} benchmark no-key blocked smoke (${out_dir})"
else
mark_fail "mastra benchmark no-key blocked smoke"
mark_fail "${system_name} benchmark no-key blocked smoke"
fi
}

Expand Down Expand Up @@ -253,10 +255,16 @@ if [[ "$SHOULD_RUN_LOCAL_GATES" -eq 1 ]]; then
run_required_step "backend tests" npm --prefix backend test
run_required_step "backend build" npm --prefix backend run build
run_required_step "mastra adapter syntax" node --check benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs
run_required_step "collection adapter syntax" node --check benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs
fi

if [[ "$SHOULD_RUN_BLOCKED_BENCHMARK_SMOKE" -eq 1 ]]; then
run_blocked_benchmark_smoke
run_blocked_benchmark_smoke \
"mastra" \
"node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs"
run_blocked_benchmark_smoke \
"collection-self-heal" \
"node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs"
fi

if [[ "$SHOULD_RUN_CONVEX_PUSH" -eq 1 ]]; then
Expand Down