tinyfish-io · giaphutran12 · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/.env.example b/.env.example
@@ -9,11 +9,19 @@ CLERK_SECRET_KEY=sk_test_...
 # Generate at https://openrouter.ai/settings/keys
 OPENROUTER_API_KEY=sk-or-...
 
+# TinyFish — required by populate agent web search/fetch.
+# Generate at https://agent.tinyfish.ai/api-keys
+TINYFISH_API_KEY=
+
 # Generate once after the first `make dev` with:
 #   docker compose exec convex ./generate_admin_key.sh
 # Used by the backend container to call internal Convex functions.
 CONVEX_SELF_HOSTED_ADMIN_KEY=
 
+# Durable store for self-healing populate recipe manifests.
+# Docker dev overrides this to /app/.bigset/populate-recipes on a named volume.
+POPULATE_RECIPE_STORE_DIR=.bigset/populate-recipes
+
 # PostHog (optional — leave blank to disable analytics entirely in local dev).
 # Get from https://us.posthog.com/project/settings/general.
 NEXT_PUBLIC_POSTHOG_KEY=

diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 .DS_Store
 node_modules/
+backend/node_modules
 .env
 .env.local
 Project_BigSet_brief.md
@@ -14,16 +15,18 @@ Project_BigSet_brief.md
 *.log
 npm-debug.log*
 yarn-debug.log*
+/benchmark-results/
 
 # Local-only files
 *.bak
 tmp/
 temp/
 
 .mastra
+.bigset/
 
 # Local tarballs
 *.tgz
 
 # Internal docs
-BigSet Technical Specs & Goals.md
+BigSet Technical Specs & Goals.md
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -29,7 +29,7 @@ Backend is Fastify + Mastra. Fastify serves the HTTP API (Clerk JWT auth on prot
 
 The schema inference pipeline: frontend calls `POST /infer-schema` → Fastify verifies the Clerk JWT → calls `inferSchema()` in `backend/src/pipeline/schema-inference.ts` → Claude Sonnet 4.6 via OpenRouter → returns a Zod-validated `DatasetSchema` → frontend maps it to editable columns in the wizard.
 
-The populate pipeline: frontend calls `POST /populate` with `{ datasetId, datasetName, description, columns }` → Fastify verifies the Clerk JWT → triggers `populateWorkflow` which: (1) clears existing rows, (2) builds a prompt from the schema, (3) runs the populate agent (Claude Sonnet 4.6) which searches the web via TinyFish APIs, then inserts rows into Convex one by one. Rows appear in realtime on the frontend via Convex reactive queries.
+The populate pipeline: frontend calls `POST /populate` with `{ datasetId, datasetName, description, columns }` → Fastify verifies the Clerk JWT → runs the self-healing populate service. The service builds or reuses a recipe, runs the Mastra populate runtime against TinyFish search/fetch, validates source-backed rows, repairs bad recipes, promotes the passing recipe, then atomically replaces the dataset rows in Convex. Rows appear in realtime on the frontend via Convex reactive queries.
 
 Convex functions use `ctx.auth.getUserIdentity()` to get the authenticated user. The `ownerId` field on datasets stores `identity.subject` (Clerk user ID). Do not pass `ownerId` from the client.
 
@@ -49,4 +49,10 @@ Convex is self-hosted — it does NOT hot-reload when you edit files in `fronten
 
 In CI/prod, run `npx convex deploy` with `CONVEX_SELF_HOSTED_URL` and `CONVEX_SELF_HOSTED_ADMIN_KEY` set as env vars.
 
+## Self-Healing Verification
+
+Run `make verify-self-healing` before handing the stack to another agent. It runs backend tests, backend build, adapter syntax checks, and a no-key benchmark smoke that should block cleanly without spending API credits.
+
+Use `bash scripts/verify-self-healing-stack.sh --real-benchmark` for the 2-prompt real Mastra benchmark, and `bash scripts/verify-self-healing-stack.sh --convex-push --dataset-id <dataset-id>` for a live app dataset dry-run. Export the required env vars before live modes; the verifier does not parse secret files itself. Add `--commit` only when you intentionally want to replace rows.
+
 This is an open-source (AGPL) project. Do not commit secrets, API keys, or internal docs.
diff --git a/backend/.env.example b/backend/.env.example
@@ -1,6 +1,7 @@
 CLIENT_ORIGIN=http://localhost:3500
 CONVEX_URL=http://localhost:3210
 PORT=3501
+POPULATE_RECIPE_STORE_DIR=.bigset/populate-recipes
 
 # Required once the backend starts writing rows via internal Convex mutations.
 # Generate with: docker compose exec convex ./generate_admin_key.sh

diff --git a/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts b/backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts
@@ -0,0 +1,114 @@
+import type { FetchedPage } from "../models/schemas.js";
+import type { WorkflowMemory } from "../memory/types.js";
+import { domainMemoryBoost } from "../memory/workflow-memory.js";
+import { getDomain, normalizeUrl } from "../utils/url.js";
+
+const SKIP_HOST =
+  /(?:facebook|twitter|x\.com|instagram|youtube|tiktok|pinterest|reddit\.com\/r\/|linkedin\.com\/in\/|accounts\.google|login|signin|signup|register|cookie|privacy|terms|cdn\.|static\.|fonts\.)/i;
+const SKIP_EXT = /\.(?:pdf|zip|png|jpe?g|gif|svg|webp|css|js|woff2?|xml|mp4|mp3)(?:\?|$)/i;
+const POSITIVE_PATH =
+  /\/(?:blog|news|docs|documentation|pricing|billing|investor|investors|earnings|financial|reports|press|release|releases|mcp|model-context-protocol|agents|company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i;
+const NEGATIVE_PATH =
+  /\/(?:tag|tags|category|categories|author|feed|rss|search|wp-admin|wp-content)(?:\/|$|\?)/i;
+
+export interface LinkFollowOptions {
+  pages: FetchedPage[];
+  excludeUrls: Set<string>;
+  focusFields?: string[];
+  maxTotal: number;
+  maxPerSource: number;
+  memory?: WorkflowMemory;
+}
+
+function pathTokensFromFields(fields?: string[]): string[] {
+  if (!fields?.length) return [];
+  return fields
+    .flatMap((field) =>
+      field
+        .split(/[_\s-]+/)
+        .map((part) => part.toLowerCase())
+        .filter((part) => part.length > 3),
+    )
+    .slice(0, 12);
+}
+
+function scoreLink(
+  link: string,
+  sourceDomain: string,
+  focusTokens: string[],
+  memory?: WorkflowMemory,
+): number {
+  let score = 0;
+
+  try {
+    const parsed = new URL(link);
+    const host = parsed.hostname.toLowerCase();
+    const path = `${parsed.pathname}${parsed.search}`.toLowerCase();
+
+    if (SKIP_HOST.test(host) || SKIP_EXT.test(path)) return -1000;
+    if (NEGATIVE_PATH.test(path)) score -= 2;
+    if (POSITIVE_PATH.test(path)) score += 4;
+
+    const linkDomain = getDomain(link);
+    if (linkDomain === sourceDomain) score += 3;
+    else if (linkDomain.endsWith(`.${sourceDomain}`) || sourceDomain.endsWith(`.${linkDomain}`)) {
+      score += 2;
+    }
+
+    for (const token of focusTokens) {
+      if (path.includes(token)) score += 2;
+    }
+
+    if (memory) score += domainMemoryBoost(memory, linkDomain);
+
+    if (path.length > 120) score -= 1;
+    if (parsed.hash.length > 1) score -= 1;
+  } catch {
+    return -1000;
+  }
+
+  return score;
+}
+
+/** Pick outbound links from high-value pages using URL heuristics only. */
+export function selectOutboundLinksToFollow(
+  options: LinkFollowOptions,
+): string[] {
+  const focusTokens = pathTokensFromFields(options.focusFields);
+  const selected: string[] = [];
+  const selectedSet = new Set<string>();
+
+  const pagesWithLinks = options.pages
+    .filter((page) => !page.error && page.outbound_links && page.outbound_links.length > 0)
+    .sort((a, b) => (b.outbound_links?.length ?? 0) - (a.outbound_links?.length ?? 0));
+
+  for (const page of pagesWithLinks) {
+    const sourceUrl = normalizeUrl(page.final_url || page.url);
+    const sourceDomain = getDomain(sourceUrl);
+    let perSource = 0;
+
+    const ranked = [...(page.outbound_links ?? [])]
+      .map((link) => ({
+        link,
+        score: scoreLink(link, sourceDomain, focusTokens, options.memory),
+      }))
+      .filter((item) => item.score > 0)
+      .sort((a, b) => b.score - a.score);
+
+    for (const { link } of ranked) {
+      if (selected.length >= options.maxTotal) return selected;
+      if (perSource >= options.maxPerSource) break;
+
+      const normalized = normalizeUrl(link);
+      if (options.excludeUrls.has(normalized)) continue;
+      if (selectedSet.has(normalized)) continue;
+      if (normalized === sourceUrl) continue;
+
+      selectedSet.add(normalized);
+      selected.push(link);
+      perSource += 1;
+    }
+  }
+
+  return selected;
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts b/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts
@@ -0,0 +1,64 @@
+import { completeJson } from "../integrations/openrouter.js";
+import {
+  memoryContextForAgents,
+  type WorkflowMemory,
+} from "../memory/index.js";
+import { agentGoalSchema, type AgentGoal } from "../models/schemas.js";
+import type { DatasetSpec, SourceTriageResult } from "../models/schemas.js";
+
+const AGENT_GOAL_SYSTEM = `You are the Navigation Task Agent for a web data collection pipeline.
+
+Write a Tinyfish Agent goal: a clear natural-language instruction for browser automation on the given URL.
+
+The agent must navigate the site and return structured JSON with extracted data matching the dataset schema.
+
+Rules:
+- Be specific about what to click, search, filter, or paginate.
+- State the exact JSON shape to return: { "records": [ { column_name: value, ... } ] }
+- Include column names from the schema in the goal.
+- For forms: describe fields to fill and how to submit.
+- For detail follow-up: explain how to open each item and which fields to collect.
+- Limit scope (e.g. first 25 rows) to keep runs reliable.
+- Do not invent data; extract only what is visible on the site.
+- When workflow_memory is provided, reuse goal patterns from agent_goal_stats_top (high avg_completeness/confidence); avoid domains in domain_stats_weak unless diagnosis says otherwise.
+- If latest_diagnosis.prefer_tinyfish_agent or agent_strategy_notes exist, follow them.
+- Return ONLY JSON with fields: goal, rationale`;
+
+export async function generateAgentGoal(options: {
+  userPrompt: string;
+  spec: DatasetSpec;
+  triage: SourceTriageResult;
+  focusFields?: string[];
+  memory?: WorkflowMemory;
+}): Promise<AgentGoal> {
+  const columnList = options.spec.columns
+    .map((c) => `${c.name} (${c.type}${c.required ? ", required" : ""})`)
+    .join(", ");
+
+  return completeJson({
+    label: `agent_goal:${options.triage.final_url}`,
+    schema: agentGoalSchema,
+    messages: [
+      { role: "system", content: AGENT_GOAL_SYSTEM },
+      {
+        role: "user",
+        content: JSON.stringify({
+          user_prompt: options.userPrompt,
+          triage_status: options.triage.status,
+          triage_reasoning: options.triage.reasoning,
+          suggested_action: options.triage.suggested_action,
+          page_url: options.triage.final_url,
+          page_title: options.triage.title,
+          row_grain: options.spec.row_grain,
+          columns: columnList,
+          focus_fields: options.focusFields ?? [],
+          extraction_hints: options.spec.extraction_hints,
+          workflow_memory: options.memory
+            ? memoryContextForAgents(options.memory)
+            : undefined,
+          output_shape: { goal: "string", rationale: "string" },
+        }),
+      },
+    ],
+  });
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts b/backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts
@@ -0,0 +1,94 @@
+import type { ColumnDef, DatasetSpec } from "../models/schemas.js";
+import { normalizeSpecColumnOrder } from "./dataset-spec.js";
+
+/** Benchmark harness fields from prompts.json (via env in adapters). */
+export interface BenchmarkSpecContext {
+  promptId?: string;
+  promptQuality?: string;
+  persona?: string;
+  expectedStress?: string;
+  requiredColumns: string[];
+}
+
+export function hasBenchmarkRequiredColumns(
+  context?: BenchmarkSpecContext,
+): context is BenchmarkSpecContext & { requiredColumns: string[] } {
+  return Boolean(context?.requiredColumns?.length);
+}
+
+/** Parse comma-separated column names (CLI flag or benchmark env). */
+export function parseRequiredColumns(value: string): string[] {
+  const columns = value
+    .split(",")
+    .map((name) => name.trim())
+    .filter(Boolean);
+  if (columns.length === 0) {
+    throw new Error(
+      "Required columns must include at least one non-empty column name.",
+    );
+  }
+  return columns;
+}
+
+/**
+ * Ensures every benchmark-required column name exists on the spec as required.
+ * Types and descriptions come from the dataset-spec LLM when present; otherwise
+ * minimal placeholders (no per-column name heuristics).
+ */
+export function mergeSpecWithBenchmarkRequiredColumns(
+  spec: DatasetSpec,
+  context: BenchmarkSpecContext,
+): DatasetSpec {
+  const requiredColumns = context.requiredColumns;
+  const columnsByName = new Map(spec.columns.map((column) => [column.name, column]));
+
+  const requiredColumnDefs: ColumnDef[] = requiredColumns.map((name) => {
+    const existing = columnsByName.get(name);
+    if (existing) {
+      return { ...existing, required: true };
+    }
+    return {
+      name,
+      type: "string",
+      description: name,
+      required: true,
+    };
+  });
+
+  const optionalExtras = spec.columns.filter(
+    (column) => !requiredColumns.includes(column.name),
+  );
+
+  const columns = [...requiredColumnDefs, ...optionalExtras];
+  const columnNames = new Set(columns.map((column) => column.name));
+
+  const isEntityLikeColumn = (name: string): boolean =>
+    /(entity|company|organization|business|restaurant|bakery|provider|product|name|title)/i.test(
+      name,
+    );
+
+  const dedupeKey =
+    requiredColumns.find(
+      (name) => columnNames.has(name) && isEntityLikeColumn(name),
+    ) ??
+    spec.dedupe_keys.find((key) => columnNames.has(key)) ??
+    requiredColumns.find((name) => columnNames.has(name)) ??
+    spec.dedupe_keys[0];
+
+  const extractionHints = [
+    spec.extraction_hints,
+    `Benchmark required columns (use as exact row keys): ${requiredColumns.join(", ")}.`,
+    context.expectedStress
+      ? `Benchmark stress note: ${context.expectedStress}`
+      : undefined,
+  ]
+    .filter(Boolean)
+    .join("\n");
+
+  return normalizeSpecColumnOrder({
+    ...spec,
+    columns,
+    dedupe_keys: dedupeKey ? [dedupeKey] : spec.dedupe_keys,
+    extraction_hints: extractionHints,
+  });
+}