tinyfish-io · giaphutran12 · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/.env.example b/.env.example
@@ -1,14 +1,30 @@
-# These are read by docker-compose.dev.yml.
+# This is the only local env file BigSet expects.
 # Copy this file to .env and fill in your values.
 
+# Local service URLs
+CLIENT_ORIGIN=http://localhost:3500
+CONVEX_URL=http://localhost:3210
+NEXT_PUBLIC_CONVEX_URL=http://127.0.0.1:3210
+CONVEX_SELF_HOSTED_URL=http://127.0.0.1:3210
+NEXT_PUBLIC_BACKEND_URL=http://localhost:3501
+PORT=3501
+
 # Clerk — create a free app at https://dashboard.clerk.com
+# Enable the Clerk JWT Templates -> Convex template, then set your issuer URL.
 NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=pk_test_...
 CLERK_SECRET_KEY=sk_test_...
+CLERK_JWT_ISSUER_DOMAIN=https://your-app.clerk.accounts.dev
 
 # OpenRouter — required by backend + Mastra for AI model calls.
 # Generate at https://openrouter.ai/settings/keys
 OPENROUTER_API_KEY=sk-or-...
 
+# Optional model overrides.
+# Schema inference defaults to anthropic/claude-sonnet-4-6.
+# Populate and other non-inference tasks default to google/gemini-3.1-flash-lite.
+# OPENROUTER_MODEL=google/gemini-3.1-flash-lite
+# OPENROUTER_POPULATE_MODEL=google/gemini-3.1-flash-lite
+
 # TinyFish — required by populate agent web search/fetch.
 # Generate at https://agent.tinyfish.ai/api-keys
 TINYFISH_API_KEY=
@@ -22,6 +38,31 @@ CONVEX_SELF_HOSTED_ADMIN_KEY=
 # Docker dev overrides this to /app/.bigset/populate-recipes on a named volume.
 POPULATE_RECIPE_STORE_DIR=.bigset/populate-recipes
 
+# Populate runtime limits.
+# POPULATE_MAX_ROWS=100
+# POPULATE_MAX_SEARCH_CALLS=25
+# POPULATE_MAX_FETCH_CALLS=50
+# POPULATE_COMMIT_ROW_LIMIT_PER_HOUR=1000
+
+# Browser-action self-healing. Non-secret tunables.
+POPULATE_ENABLE_BROWSER_ACTION_BOX=true
+POPULATE_BROWSER_ACTION_BOX_POLL_INTERVAL_MS=3000
+POPULATE_ENABLE_PLAYWRIGHT_REPLAY=true
+POPULATE_ENABLE_PLAYWRIGHT_REPAIR=true
+POPULATE_PLAYWRIGHT_HEADLESS=true
+# POPULATE_PLAYWRIGHT_EXECUTABLE_PATH=
+
+# Collection-agent canaries. Leave Mastra as the default app runtime unless
+# intentionally benchmarking the collection runner.
+# POPULATE_AGENT_RUNTIME=collection
+# POPULATE_COLLECTION_RUNNER_MODULE=./backend/src/pipeline/collection-agent-runner.ts
+COLLECTION_AGENT_PIPELINE_MODULE=./backend/BigSet_Data_Collection_Agent/src/orchestrator/pipeline.ts
+COLLECTION_AGENT_ENABLE_TRIAGE=true
+COLLECTION_AGENT_ENABLE_AGENT=false
+COLLECTION_AGENT_POLL_TIMEOUT_MS=1200000
+AGENT_POLL_TIMEOUT_MS=1200000
+AGENT_REQUEST_TIMEOUT_MS=15000
+
 # PostHog (optional — leave blank to disable analytics entirely in local dev).
 # Get from https://us.posthog.com/project/settings/general.
 NEXT_PUBLIC_POSTHOG_KEY=

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -8,14 +8,14 @@ Frontend on :3500, backend on :3501, Mastra Studio on :4111, Convex dashboard on
 
 1. Create a free Clerk account at https://clerk.com and create an application.
 2. In the Clerk dashboard, go to **JWT Templates** and enable the **Convex** template.
-3. Copy `frontend/.env.example` to `frontend/.env.local` and fill in your Clerk keys:
+3. Copy `.env.example` to `.env` and fill in your Clerk keys:
    - `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY` — from Clerk API Keys
    - `CLERK_SECRET_KEY` — from Clerk API Keys
    - `CLERK_JWT_ISSUER_DOMAIN` — your Frontend API URL (e.g. `https://your-app.clerk.accounts.dev`)
-4. Add an OpenRouter API key to the root `.env` file: `OPENROUTER_API_KEY=sk-or-...` (get one at https://openrouter.ai/settings/keys). Docker Compose reads the root `.env` and passes it to the backend and Mastra containers.
-4b. Add a TinyFish API key to the root `.env` file: `TINYFISH_API_KEY=...` (get one at https://agent.tinyfish.ai/api-keys). This enables the populate agent to search the web and fetch page content.
+4. Add an OpenRouter API key to `.env`: `OPENROUTER_API_KEY=sk-or-...` (get one at https://openrouter.ai/settings/keys). Docker Compose reads the root `.env` and passes it to the frontend, backend, and Mastra containers.
+4b. Add a TinyFish API key to `.env`: `TINYFISH_API_KEY=...` (get one at https://agent.tinyfish.ai/api-keys). This enables the populate agent to search the web and fetch page content.
 5. Run `make dev` — this starts all Docker services AND pushes Convex functions automatically.
-6. Generate a Convex admin key (first run only): `docker compose exec convex ./generate_admin_key.sh` and add it as `CONVEX_SELF_HOSTED_ADMIN_KEY` in `frontend/.env.local`, then re-run `make dev`.
+6. Generate a Convex admin key (first run only): `docker compose exec convex ./generate_admin_key.sh` and add it as `CONVEX_SELF_HOSTED_ADMIN_KEY` in `.env`, then re-run `make dev`.
 
 ## Architecture
 
@@ -35,13 +35,13 @@ Convex functions use `ctx.auth.getUserIdentity()` to get the authenticated user.
 
 ## Environment Variables
 
-Docker Compose interpolates variables from the root `.env` file. Key variables:
+Root `.env` is the only local env file. Docker Compose, package scripts, Convex helper targets, and benchmarks load it. Key variables:
 - `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY`, `CLERK_SECRET_KEY` — shared by frontend and backend
 - `OPENROUTER_API_KEY` — used by backend and Mastra for AI model calls
 - `CONVEX_SELF_HOSTED_ADMIN_KEY` — used by backend for system-level Convex writes
 - `TINYFISH_API_KEY` — used by the populate agent for web search and fetch (get one at https://agent.tinyfish.ai/api-keys)
 
-The backend container maps `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY` → `CLERK_PUBLISHABLE_KEY` (see `docker-compose.dev.yml`).
+The backend accepts `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY` as the publishable Clerk key, and the Docker backend container also maps it to `CLERK_PUBLISHABLE_KEY` (see `docker-compose.dev.yml`).
 
 ## Convex Deploys
 

diff --git a/README.md b/README.md
@@ -44,16 +44,12 @@ cd bigset
 
 Create a Clerk application at [dashboard.clerk.com](https://dashboard.clerk.com), then go to **JWT Templates** and enable the **Convex** template.
 
-### 2. Configure env files
+### 2. Configure env
 
 ```bash
-# Root .env — used by Docker for the frontend container
 cp .env.example .env
-# Fill in NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY and CLERK_SECRET_KEY
-
-# Frontend .env.local — used by Next.js and Convex CLI
-cp frontend/.env.example frontend/.env.local
-# Fill in all three Clerk keys (publishable, secret, and JWT issuer domain)
+# Fill in NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY, CLERK_SECRET_KEY,
+# CLERK_JWT_ISSUER_DOMAIN, OPENROUTER_API_KEY, and TINYFISH_API_KEY
 ```
 
 > **Required for the create-dataset wizard:** set `OPENROUTER_API_KEY` (used by the schema-inference pipeline). Get one at [openrouter.ai](https://openrouter.ai). Without it the wizard's "Generate Schema" step will fail.
@@ -66,7 +62,11 @@ cp frontend/.env.example frontend/.env.local
 make dev
 ```
 
-This starts all Docker services, waits for Convex to be healthy, and deploys Convex functions automatically. Once it's up:
+This starts all Docker services, waits for Convex to be healthy, and deploys Convex functions automatically.
+`make dev` checks that root `.env` contains real Clerk/OpenRouter/TinyFish
+values before it starts Docker. If it reports a placeholder key, replace that
+value first.
+Once it is up:
 
 - App: http://localhost:3500
 - Convex dashboard: http://localhost:6791
@@ -78,26 +78,31 @@ This starts all Docker services, waits for Convex to be healthy, and deploys Con
 docker compose exec convex ./generate_admin_key.sh
 ```
 
-Paste the output into `frontend/.env.local` as `CONVEX_SELF_HOSTED_ADMIN_KEY`, then re-run `make dev`.
+Paste the output into `.env` as `CONVEX_SELF_HOSTED_ADMIN_KEY`, then re-run
+`make dev`.
+
+If `make dev` stops at `CONVEX_SELF_HOSTED_ADMIN_KEY is missing`, the Convex
+container is already running, so you can run the command above. Generate the
+key, paste it into the root `.env`, and run `make dev` again.
 
 ### 5. Load curated public datasets
 
 The landing page and the dashboard's "Curated" section read from a set of 9 system-owned datasets. Load them with:
 
 ```bash
-cd frontend
-npx convex run publicSeed:seedPublicDatasets
+make seed-public-datasets
 ```
 
 The script is **idempotent** — rerunning it skips datasets that already exist (matched by a stable `seedKey`, so renaming a curated dataset never creates a duplicate). To add a 10th curated dataset, append it to `PUBLIC_DATASETS` in [frontend/convex/publicSeed.ts](frontend/convex/publicSeed.ts) with a fresh `seedKey` and rerun the command. To replace existing curated content in place, pass `force: true`:
 
 ```bash
-npx convex run publicSeed:seedPublicDatasets '{"force":true}'
+cd frontend
+node ../scripts/with-root-env.mjs npx convex run publicSeed:seedPublicDatasets '{"force":true}'
 ```
 
 Open [localhost:3500](http://localhost:3500) and click **Get started** to sign in.
 
-> **Note:** Backend env needs no setup — `backend/.env.example` has correct defaults. If you edit Convex functions in `frontend/convex/`, run `make convex-push` to deploy the changes.
+> **Note:** The root `.env` is the only local env file. If you edit Convex functions in `frontend/convex/`, run `make convex-push` to deploy the changes.
 
 > **Free tier:** each signed-in account gets **2,500 row operations per calendar month** (resets on the 1st, UTC). The header shows a live usage badge; system-owned curated datasets bypass the quota.
 
@@ -123,12 +128,11 @@ Open [localhost:3500](http://localhost:3500) and click **Get started** to sign i
 bigset/
 ├── frontend/            Next.js 16 — UI + Convex schema & functions
 │   ├── convex/          Convex functions, schema, authz + quota helpers
-│   └── .env.local       Clerk + Convex keys (not committed)
 ├── backend/             Fastify + Mastra — schema inference + (future) agents
 │   ├── src/pipeline/    Pure schema-inference fn (called by Fastify + Mastra)
 │   └── src/mastra/      Mastra workflows (Studio at :4111 in dev)
 ├── scripts/             One-off scripts (e.g. verify-authz.sh)
-├── .env                 Clerk keys for docker-compose (not committed)
+├── .env                 Local env for frontend, backend, Convex CLI, benchmarks (not committed)
 ├── docker-compose.dev.yml
 └── Makefile
 ```

diff --git a/backend/.env.example b/backend/.env.example
diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts b/backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts
@@ -5,6 +5,15 @@ import {
 } from "../memory/index.js";
 import { agentGoalSchema, type AgentGoal } from "../models/schemas.js";
 import type { DatasetSpec, SourceTriageResult } from "../models/schemas.js";
+import type { LlmMessage } from "../integrations/openrouter.js";
+
+export const AGENT_BROWSER_ACTION_CONTRACT = `Browser action reporting contract:
+- The Tinyfish Agent result JSON MUST include "agent_browser_actions" next to "records".
+- "agent_browser_actions" is an ordered array of browser steps the agent actually performed.
+- Each action should use this shape when known: { "action": "navigate|click|type|select|wait|extract|screenshot|unknown", "url": "current page URL", "selector": "CSS selector when known", "target_text": "visible button/link/field text when known", "value_description": "safe description of typed/selected value, never secrets", "status": "succeeded|failed", "error": "failure reason if any", "phase": "initial|search|filter|pagination|detail|form|extract", "label": "short human label" }.
+- Record navigation, clicks, form fills, pagination, waits that affected extraction, and final extraction.
+- If a selector is unknown, still include url plus target_text when visible. If no browser action happened, return an empty array.
+- Do not include raw passwords, tokens, cookies, or private user-entered values in value_description.`;
 
 const AGENT_GOAL_SYSTEM = `You are the Navigation Task Agent for a web data collection pipeline.
 
@@ -14,8 +23,9 @@ The agent must navigate the site and return structured JSON with extracted data
 
 Rules:
 - Be specific about what to click, search, filter, or paginate.
-- State the exact JSON shape to return: { "records": [ { column_name: value, ... } ] }
+- State the exact JSON shape to return: { "records": [ { column_name: value, ... } ], "agent_browser_actions": [ ... ] }
 - Include column names from the schema in the goal.
+- Include the browser action reporting contract verbatim enough that the Tinyfish Agent knows it must report replay-oriented actions.
 - For forms: describe fields to fill and how to submit.
 - For detail follow-up: explain how to open each item and which fields to collect.
 - Limit scope (e.g. first 25 rows) to keep runs reliable.
@@ -31,34 +41,45 @@ export async function generateAgentGoal(options: {
   focusFields?: string[];
   memory?: WorkflowMemory;
 }): Promise<AgentGoal> {
-  const columnList = options.spec.columns
-    .map((c) => `${c.name} (${c.type}${c.required ? ", required" : ""})`)
-    .join(", ");
-
   return completeJson({
     label: `agent_goal:${options.triage.final_url}`,
     schema: agentGoalSchema,
-    messages: [
-      { role: "system", content: AGENT_GOAL_SYSTEM },
-      {
-        role: "user",
-        content: JSON.stringify({
-          user_prompt: options.userPrompt,
-          triage_status: options.triage.status,
-          triage_reasoning: options.triage.reasoning,
-          suggested_action: options.triage.suggested_action,
-          page_url: options.triage.final_url,
-          page_title: options.triage.title,
-          row_grain: options.spec.row_grain,
-          columns: columnList,
-          focus_fields: options.focusFields ?? [],
-          extraction_hints: options.spec.extraction_hints,
-          workflow_memory: options.memory
-            ? memoryContextForAgents(options.memory)
-            : undefined,
-          output_shape: { goal: "string", rationale: "string" },
-        }),
-      },
-    ],
+    messages: buildAgentGoalMessages(options),
   });
 }
+
+export function buildAgentGoalMessages(options: {
+  userPrompt: string;
+  spec: DatasetSpec;
+  triage: SourceTriageResult;
+  focusFields?: string[];
+  memory?: WorkflowMemory;
+}): LlmMessage[] {
+  const columnList = options.spec.columns
+    .map((c) => `${c.name} (${c.type}${c.required ? ", required" : ""})`)
+    .join(", ");
+
+  return [
+    { role: "system", content: AGENT_GOAL_SYSTEM },
+    {
+      role: "user",
+      content: JSON.stringify({
+        user_prompt: options.userPrompt,
+        triage_status: options.triage.status,
+        triage_reasoning: options.triage.reasoning,
+        suggested_action: options.triage.suggested_action,
+        page_url: options.triage.final_url,
+        page_title: options.triage.title,
+        row_grain: options.spec.row_grain,
+        columns: columnList,
+        focus_fields: options.focusFields ?? [],
+        extraction_hints: options.spec.extraction_hints,
+        browser_action_reporting_contract: AGENT_BROWSER_ACTION_CONTRACT,
+        workflow_memory: options.memory
+          ? memoryContextForAgents(options.memory)
+          : undefined,
+        output_shape: { goal: "string", rationale: "string" },
+      }),
+    },
+  ];
+}
diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts b/backend/BigSet_Data_Collection_Agent/src/agents/source-policy.ts
@@ -5,7 +5,7 @@ import type {
   SourceTriageResult,
 } from "../models/schemas.js";
 import { scoreDocsUrlForOfficialSource } from "../records/source-urls.js";
-import { getDomain } from "../utils/url.js";
+import { getDomain, normalizeUrl } from "../utils/url.js";
 
 export interface PromptSourceEntity {
   name: string;
@@ -17,6 +17,7 @@ export interface PromptSourcePolicy {
   requiresOfficialSource: boolean;
   entities: PromptSourceEntity[];
   searchPhrases: string[];
+  explicitSourceUrls: string[];
   hint?: string;
 }
 
@@ -55,6 +56,14 @@ function uniqueStrings(values: string[]): string[] {
   return [...new Set(values.map((value) => value.trim()).filter(Boolean))];
 }
 
+function extractPromptSourceUrls(prompt: string): string[] {
+  return uniqueStrings(
+    [...prompt.matchAll(/https?:\/\/[^\s)"'<>]+/gi)].map((match) =>
+      normalizeUrl((match[0] ?? "").replace(/[.,;:!?]+$/g, "")),
+    ),
+  );
+}
+
 function tokenize(value: string): string[] {
   return value
     .toLowerCase()
@@ -157,6 +166,7 @@ export function derivePromptSourcePolicy(prompt: string): PromptSourcePolicy {
   const taskText = taskTextFromPrompt(prompt);
   const entities = extractExplicitEntities(taskText);
   const searchPhrases = searchPhrasesForPrompt(taskText);
+  const explicitSourceUrls = extractPromptSourceUrls(taskText);
   const lower = taskText.toLowerCase();
   const asksForCanonicalSource =
     searchPhrases.length > 0 ||
@@ -182,7 +192,7 @@ export function derivePromptSourcePolicy(prompt: string): PromptSourcePolicy {
       ].join("\n")
     : undefined;
 
-  return { requiresOfficialSource, entities, searchPhrases, hint };
+  return { requiresOfficialSource, entities, searchPhrases, explicitSourceUrls, hint };
 }
 
 export function promptSourceSearchQueries(policy: PromptSourcePolicy): string[] {
@@ -236,6 +246,7 @@ export function urlMatchesPromptSourcePolicy(
   url: string,
   policy: PromptSourcePolicy,
 ): boolean {
+  if (urlMatchesExplicitPromptSource(url, policy)) return true;
   if (!policy.requiresOfficialSource) return true;
   const domain = getDomain(url).toLowerCase();
   if (GENERIC_HOSTED_DOMAIN.test(domain)) {
@@ -246,6 +257,17 @@ export function urlMatchesPromptSourcePolicy(
   );
 }
 
+function urlMatchesExplicitPromptSource(
+  url: string,
+  policy: PromptSourcePolicy,
+): boolean {
+  const normalized = normalizeUrl(url);
+  return policy.explicitSourceUrls.some((sourceUrl) => {
+    const explicit = normalizeUrl(sourceUrl);
+    return normalized === explicit || normalized.startsWith(`${explicit}/`);
+  });
+}
+
 function urlMatchesEntitySourcePolicy(
   url: string,
   entity: PromptSourceEntity,
@@ -361,6 +383,9 @@ export function recordMatchesPromptSourcePolicy(
   if (urls.length === 0) {
     return false;
   }
+  if (urls.some((url) => urlMatchesExplicitPromptSource(url, policy))) {
+    return true;
+  }
 
   return urls.some((url) => urlMatchesEntitySourcePolicy(url, entity, policy));
 }