tinyfish-io · giaphutran12 · May 22, 2026
diff --git a/benchmarks/dataset-agent/README.md b/benchmarks/dataset-agent/README.md
@@ -21,6 +21,25 @@ Real Mastra benchmark runs require `OPENROUTER_API_KEY` and `TINYFISH_API_KEY`
 loaded execution-only. If either is missing, the adapter returns a blocked
 benchmark result instead of touching app data.
 
+## Run Collection Inside Self-Healing
+
+The collection adapter uses the same benchmark runner, but wraps
+`CollectionPopulateRecipeRuntime` inside `SelfHealingPopulateRecipeService`.
+That means collection results are scored after the same recipe generation,
+repair, validation, and promotion path as the app runtime.
+
+```bash
+node benchmarks/dataset-agent/run-benchmark.mjs \
+  --prompt-ids latest-ai-blog-posts,saas-pricing-pages \
+  --system collection-self-heal='node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs'
+```
+
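+Real collection benchmark runs require `OPENROUTER_API_KEY`,
+`TINYFISH_API_KEY`, and `BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE` loaded in
+the shell. `BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE` is a module specifier:
+values starting with `.` or `/` are resolved as file paths against the current
+working directory, and anything else is passed to `import()` unchanged. The
+runner module must export `runCollectionPopulatePipeline(input)` or a default
+export function that accepts `CollectionPopulatePipelineInput` and returns a
+`PopulateRuntimeResult`. Set `BIGSET_COLLECTION_BENCHMARK_MAX_ROWS` to change
+the target row count (default `10`).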
+
 ## Verify Self-Healing Stack
 
 Use this before asking someone else to migrate a new collection agent into the
@@ -30,9 +49,9 @@ app path:
 make verify-self-healing
 ```
 
-That command runs backend tests, backend build, adapter syntax checks, and a
-no-key benchmark smoke that must produce a clean `blocked` result without
-spending OpenRouter or TinyFish credits.
+That command runs backend tests, backend build, adapter syntax checks, and
+Mastra + collection no-key benchmark smokes that must produce clean `blocked`
+results without spending OpenRouter or TinyFish credits.
 
 Live checks are explicit:
 

diff --git a/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs b/benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs
@@ -0,0 +1,163 @@
+#!/usr/bin/env node
+import { pathToFileURL } from "node:url";
+import { resolve } from "node:path";
+
+// Benchmark inputs arrive through environment variables. The two required
+// ones throw via requiredEnv() when absent; the others fall back to defaults.
+const prompt = requiredEnv("BIGSET_BENCHMARK_PROMPT");
+const promptId = process.env.BIGSET_BENCHMARK_PROMPT_ID ?? "benchmark-prompt";
+const promptQuality = process.env.BIGSET_BENCHMARK_PROMPT_QUALITY ?? "unknown";
+// Comma-separated column names; columnList() trims entries and drops blanks.
+const requiredColumns = columnList(
+  requiredEnv("BIGSET_BENCHMARK_REQUIRED_COLUMNS")
+);
+// Optional subset checked per row by minimumColumnIssues(); empty when unset.
+const minimumRequiredColumns = columnList(
+  process.env.BIGSET_BENCHMARK_MINIMUM_REQUIRED_COLUMNS ?? ""
+);
+
+// Blocked guard 1: without both API keys, print a result with no rows, one
+// validation issue and zeroed usage/metrics, then exit 0. This happens before
+// the runner module or any backend module is imported.
+const missingRuntimeKeys = ["OPENROUTER_API_KEY", "TINYFISH_API_KEY"].filter(
+  (name) => !process.env[name]
+);
+if (missingRuntimeKeys.length > 0) {
+  console.log(JSON.stringify({
+    rows: [],
+    validationIssues: [
+      `Missing ${missingRuntimeKeys.join(", ")} for collection self-healing benchmark.`,
+    ],
+    usage: emptyUsage(),
+    metrics: emptyMetrics(),
+  }));
+  process.exit(0);
+}
+
+// Blocked guard 2: loadCollectionRunner() returns undefined when
+// BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE is unset; report that in the same
+// result shape as guard 1 and exit 0. A configured module that does not
+// export a function makes loadCollectionRunner() throw instead.
+const collectionRunner = await loadCollectionRunner();
+if (!collectionRunner) {
+  console.log(JSON.stringify({
+    rows: [],
+    validationIssues: [
+      "Collection self-healing benchmark runner is not configured. Set BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE to a module exporting runCollectionPopulatePipeline(input).",
+    ],
+    usage: emptyUsage(),
+    metrics: emptyMetrics(),
+  }));
+  process.exit(0);
+}
+
+// The backend pipeline modules are imported dynamically, and only after both
+// guards above have passed, so the blocked paths never load them.
+// NOTE(review): these are .ts sources; presumably this relies on the tsx
+// loader passed via `node --import` in the documented command — confirm.
+const {
+  diagnosticRunForTick,
+  validationIssuesForSelfHealingTick,
+} = await import(
+  "../../../backend/src/pipeline/populate-self-healing-runner.ts"
+);
+const {
+  DefaultPopulateRecipeAuthor,
+  InMemoryPopulateRecipeStore,
+  SelfHealingPopulateRecipeService,
+} = await import(
+  "../../../backend/src/pipeline/populate-self-healing.ts"
+);
+const {
+  CollectionPopulateRecipeRuntime,
+} = await import(
+  "../../../backend/src/pipeline/populate-collection-runtime.ts"
+);
+
+// Describe the benchmark prompt as a dataset: ids are derived from the prompt
+// id, and each required column gets a type guessed from its name.
+const context = {
+  datasetId: `benchmark-${safeIdSegment(promptId)}`,
+  datasetName: `benchmark_${safeIdSegment(promptId)}`,
+  description: prompt,
+  columns: requiredColumns.map((columnName) => ({
+    name: columnName,
+    type: inferPopulateColumnType(columnName),
+    description: `Benchmark requested column for ${promptQuality} prompt.`,
+  })),
+};
+// `??` only covers an unset variable: a set-but-empty value would coerce to 0
+// and non-numeric text to NaN. Fall back to the default of 10 in both cases;
+// any other finite number is passed through unchanged.
+const rawTargetRows = (process.env.BIGSET_COLLECTION_BENCHMARK_MAX_ROWS ?? "").trim();
+const parsedTargetRows = Number(rawTargetRows);
+const targetRows =
+  rawTargetRows !== "" && Number.isFinite(parsedTargetRows) ? parsedTargetRows : 10;
+// Wrap the configured collection runner in the self-healing service, using an
+// in-memory recipe store and the default recipe author.
+const service = new SelfHealingPopulateRecipeService({
+  store: new InMemoryPopulateRecipeStore(),
+  runtime: new CollectionPopulateRecipeRuntime({
+    runPipeline: collectionRunner,
+    targetRows,
+  }),
+  author: new DefaultPopulateRecipeAuthor(),
+});
+// Run a single tick and extract its diagnostic run (may be absent, hence the
+// `?.` fallbacks below).
+const tick = await service.tick({ datasetId: context.datasetId, context });
+const result = diagnosticRunForTick(tick);
+
+// Emit the benchmark result as one JSON line on stdout: the tick's own
+// validation issues plus this adapter's minimum-column checks.
+console.log(JSON.stringify({
+  rows: result?.rows ?? [],
+  validationIssues: [
+    ...validationIssuesForSelfHealingTick(tick),
+    ...minimumColumnIssues(result?.rows ?? []),
+  ],
+  usage: result?.usage ?? emptyUsage(),
+  metrics: result?.metrics ?? emptyMetrics(),
+}));
+
+/**
+ * Load the collection pipeline runner named by
+ * BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE.
+ *
+ * Specifiers starting with "." or "/" are resolved against the current
+ * working directory and converted to file URLs; anything else is handed to
+ * import() untouched.
+ *
+ * @returns {Promise<Function|undefined>} The named export
+ *   `runCollectionPopulatePipeline`, else the default export; undefined when
+ *   the variable is unset or empty.
+ * @throws {Error} When the module loads but neither export is a function.
+ */
+async function loadCollectionRunner() {
+  const spec = process.env.BIGSET_COLLECTION_BENCHMARK_RUNNER_MODULE;
+  if (!spec) return undefined;
+
+  const looksLikePath = [".", "/"].some((prefix) => spec.startsWith(prefix));
+  const target = looksLikePath ? pathToFileURL(resolve(spec)).href : spec;
+
+  const {
+    runCollectionPopulatePipeline: namedRunner,
+    default: defaultRunner,
+  } = await import(target);
+  const candidate = namedRunner ?? defaultRunner;
+  if (typeof candidate === "function") {
+    return candidate;
+  }
+  throw new Error(
+    `${spec} must export runCollectionPopulatePipeline(input) or a default runner.`
+  );
+}
+
+/**
+ * Report every minimum-required column that is blank in a row.
+ *
+ * A cell counts as blank when it is undefined, null, or the empty string.
+ * Issues are ordered by row, then by the order of minimumRequiredColumns.
+ *
+ * @param {Array<{cells?: Record<string, unknown>}>} rows Result rows.
+ * @returns {string[]} One message per blank (row, column) pair.
+ */
+function minimumColumnIssues(rows) {
+  const isBlank = (cell) => cell === undefined || cell === null || cell === "";
+  return [...rows.entries()].flatMap(([rowIndex, row]) =>
+    minimumRequiredColumns
+      .filter((columnName) => isBlank(row.cells?.[columnName]))
+      .map(
+        (columnName) =>
+          `Row ${rowIndex} missing minimum required column ${columnName}.`
+      )
+  );
+}
+
+/**
+ * Guess a populate column type from the column name alone.
+ *
+ * Rules are tried in order: a url-like suffix, a date-like suffix, a boolean
+ * prefix, then a numeric keyword. The numeric keyword (optionally plural)
+ * must end its word, so names that merely start with one, such as "country"
+ * or "scoreboard", fall through to "text". Keywords at the end of a compound
+ * ("subtotal", "headcount") still count as numeric.
+ *
+ * @param {string} columnName Column name in snake_case or camelCase.
+ * @returns {"url"|"date"|"boolean"|"number"|"text"} The inferred type.
+ */
+function inferPopulateColumnType(columnName) {
+  if (/(url|website|link|page)$/i.test(columnName)) return "url";
+  if (/(date|_at)$/i.test(columnName)) return "date";
+  if (/^(is_|has_|can_)/i.test(columnName)) return "boolean";
+  // Split camelCase humps before lowercasing so "priceUSD" and "viewCount"
+  // keep a word boundary after/before the keyword.
+  const normalized = String(columnName)
+    .replace(/([a-z0-9])([A-Z])/g, "$1_$2")
+    .toLowerCase();
+  if (/(count|price|amount|score|number|total)s?(?![a-z])/.test(normalized)) {
+    return "number";
+  }
+  return "text";
+}
+
+/**
+ * Turn an arbitrary value into an id-safe segment.
+ *
+ * Every character other than ASCII letters, digits, ".", "_" and "-" becomes
+ * "_", and the result is capped at 80 characters.
+ *
+ * @param {unknown} value Value to sanitise; coerced with String().
+ * @returns {string} The sanitised, truncated segment.
+ */
+function safeIdSegment(value) {
+  const MAX_LENGTH = 80;
+  // \w already covers letters, digits and "_"; an "_" maps to itself anyway.
+  const sanitized = String(value).replace(/[^\w.-]/g, "_");
+  return sanitized.substring(0, MAX_LENGTH);
+}
+
+/**
+ * Parse a comma-separated list of column names.
+ *
+ * @param {string} value Raw comma-separated text.
+ * @returns {string[]} Trimmed names in input order, with blank entries removed.
+ */
+function columnList(value) {
+  const names = [];
+  for (const piece of value.split(",")) {
+    const trimmed = piece.trim();
+    if (trimmed) {
+      names.push(trimmed);
+    }
+  }
+  return names;
+}
+
+/**
+ * Build a fresh token-usage record with every counter at zero.
+ *
+ * @returns {{promptTokens: number, completionTokens: number, totalTokens: number}}
+ */
+function emptyUsage() {
+  const zero = 0;
+  return {
+    promptTokens: zero,
+    completionTokens: zero,
+    totalTokens: zero,
+  };
+}
+
+/**
+ * Build a fresh tool/agent metrics record with every counter at zero.
+ *
+ * @returns {{searchCalls: number, fetchCalls: number, browserCalls: number,
+ *   agentRuns: number, agentSteps: number}}
+ */
+function emptyMetrics() {
+  const counterNames = [
+    "searchCalls",
+    "fetchCalls",
+    "browserCalls",
+    "agentRuns",
+    "agentSteps",
+  ];
+  return Object.fromEntries(counterNames.map((counter) => [counter, 0]));
+}
+
+/**
+ * Read an environment variable that must be present.
+ *
+ * @param {string} name Environment variable name.
+ * @returns {string} The variable's non-empty value.
+ * @throws {Error} When the variable is unset or empty.
+ */
+function requiredEnv(name) {
+  const { [name]: value } = process.env;
+  if (value) {
+    return value;
+  }
+  throw new Error(`Missing ${name}. Run through run-benchmark.mjs.`);
+}
diff --git a/scripts/verify-self-healing-stack.sh b/scripts/verify-self-healing-stack.sh
@@ -90,18 +90,20 @@ check_convex_ready() {
 }
 
 run_blocked_benchmark_smoke() {
-  local out_dir="benchmark-results/self-healing-blocked-smoke-$(date +%Y%m%d-%H%M%S)"
+  local system_name="$1"
+  local system_command="$2"
+  local out_dir="benchmark-results/${system_name}-blocked-smoke-$(date +%Y%m%d-%H%M%S)"
   local stdout_file="${out_dir}/runner-stdout.json"
 
   mkdir -p "$out_dir"
-  printf 'RUN   mastra benchmark no-key blocked smoke\n'
+  printf 'RUN   %s benchmark no-key blocked smoke\n' "$system_name"
   if ! env -u OPENROUTER_API_KEY -u TINYFISH_API_KEY node benchmarks/dataset-agent/run-benchmark.mjs \
     --prompt-ids latest-ai-blog-posts \
     --timeout-ms 60000 \
     --out "$out_dir" \
-    --system "mastra=node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs" \
+    --system "${system_name}=${system_command}" \
     > "$stdout_file"; then
-    mark_fail "mastra benchmark no-key blocked smoke"
+    mark_fail "${system_name} benchmark no-key blocked smoke"
     return
   fi
 
@@ -154,9 +156,9 @@ for (const result of summary.laneResults ?? []) {
   }
 }
 ' "${out_dir}/summary.json"; then
-    mark_pass "mastra benchmark no-key blocked smoke (${out_dir})"
+    mark_pass "${system_name} benchmark no-key blocked smoke (${out_dir})"
   else
-    mark_fail "mastra benchmark no-key blocked smoke"
+    mark_fail "${system_name} benchmark no-key blocked smoke"
   fi
 }
 
@@ -253,10 +255,16 @@ if [[ "$SHOULD_RUN_LOCAL_GATES" -eq 1 ]]; then
   run_required_step "backend tests" npm --prefix backend test
   run_required_step "backend build" npm --prefix backend run build
   run_required_step "mastra adapter syntax" node --check benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs
+  run_required_step "collection adapter syntax" node --check benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs
 fi
 
 if [[ "$SHOULD_RUN_BLOCKED_BENCHMARK_SMOKE" -eq 1 ]]; then
-  run_blocked_benchmark_smoke
+  run_blocked_benchmark_smoke \
+    "mastra" \
+    "node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/mastra-populate-adapter.mjs"
+  run_blocked_benchmark_smoke \
+    "collection-self-heal" \
+    "node --import ./backend/node_modules/tsx/dist/esm/index.mjs benchmarks/dataset-agent/adapters/collection-self-healing-adapter.mjs"
 fi
 
 if [[ "$SHOULD_RUN_CONVEX_PUSH" -eq 1 ]]; then