Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions backend/BigSet_Data_Collection_Agent/src/acquisition/link-follow.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import type { FetchedPage } from "../models/schemas.js";
import type { WorkflowMemory } from "../memory/types.js";
import { domainMemoryBoost } from "../memory/workflow-memory.js";
import { getDomain, normalizeUrl } from "../utils/url.js";

const SKIP_HOST =
/(?:facebook|twitter|x\.com|instagram|youtube|tiktok|pinterest|reddit\.com\/r\/|linkedin\.com\/in\/|accounts\.google|login|signin|signup|register|cookie|privacy|terms|cdn\.|static\.|fonts\.)/i;
const SKIP_EXT = /\.(?:pdf|zip|png|jpe?g|gif|svg|webp|css|js|woff2?|xml|mp4|mp3)(?:\?|$)/i;
const POSITIVE_PATH =
/\/(?:company|companies|startup|startups|portfolio|team|about|careers|jobs|directory|list|batch|founder|org|organization|profile|detail|view)(?:\/|$|\?)/i;
const NEGATIVE_PATH =
/\/(?:tag|tags|category|categories|author|feed|rss|search|wp-admin|wp-content)(?:\/|$|\?)/i;

export interface LinkFollowOptions {
pages: FetchedPage[];
excludeUrls: Set<string>;
focusFields?: string[];
maxTotal: number;
maxPerSource: number;
memory?: WorkflowMemory;
}

function pathTokensFromFields(fields?: string[]): string[] {
if (!fields?.length) return [];
return fields
.flatMap((field) =>
field
.split(/[_\s-]+/)
.map((part) => part.toLowerCase())
.filter((part) => part.length > 3),
)
.slice(0, 12);
}

function scoreLink(
link: string,
sourceDomain: string,
focusTokens: string[],
memory?: WorkflowMemory,
): number {
let score = 0;

try {
const parsed = new URL(link);
const host = parsed.hostname.toLowerCase();
const path = `${parsed.pathname}${parsed.search}`.toLowerCase();

if (SKIP_HOST.test(host) || SKIP_EXT.test(path)) return -1000;
if (NEGATIVE_PATH.test(path)) score -= 2;
if (POSITIVE_PATH.test(path)) score += 4;

const linkDomain = getDomain(link);
if (linkDomain === sourceDomain) score += 3;
else if (linkDomain.endsWith(`.${sourceDomain}`) || sourceDomain.endsWith(`.${linkDomain}`)) {
score += 2;
}

for (const token of focusTokens) {
if (path.includes(token)) score += 2;
}

if (memory) score += domainMemoryBoost(memory, linkDomain);

if (path.length > 120) score -= 1;
if (parsed.hash.length > 1) score -= 1;
} catch {
return -1000;
}

return score;
}

/** Pick outbound links from high-value pages using URL heuristics only. */
export function selectOutboundLinksToFollow(
options: LinkFollowOptions,
): string[] {
const focusTokens = pathTokensFromFields(options.focusFields);
const selected: string[] = [];
const selectedSet = new Set<string>();

const pagesWithLinks = options.pages
.filter((page) => !page.error && page.outbound_links && page.outbound_links.length > 0)
.sort((a, b) => (b.outbound_links?.length ?? 0) - (a.outbound_links?.length ?? 0));

for (const page of pagesWithLinks) {
const sourceUrl = normalizeUrl(page.final_url || page.url);
const sourceDomain = getDomain(sourceUrl);
let perSource = 0;

const ranked = [...(page.outbound_links ?? [])]
.map((link) => ({
link,
score: scoreLink(link, sourceDomain, focusTokens, options.memory),
}))
.filter((item) => item.score > 0)
.sort((a, b) => b.score - a.score);

for (const { link } of ranked) {
if (selected.length >= options.maxTotal) return selected;
if (perSource >= options.maxPerSource) break;

const normalized = normalizeUrl(link);
if (options.excludeUrls.has(normalized)) continue;
if (selectedSet.has(normalized)) continue;
if (normalized === sourceUrl) continue;

selectedSet.add(normalized);
selected.push(link);
perSource += 1;
}
}

return selected;
}
64 changes: 64 additions & 0 deletions backend/BigSet_Data_Collection_Agent/src/agents/agent-goal.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import { completeJson } from "../integrations/openrouter.js";
import {
memoryContextForAgents,
type WorkflowMemory,
} from "../memory/index.js";
import { agentGoalSchema, type AgentGoal } from "../models/schemas.js";
import type { DatasetSpec, SourceTriageResult } from "../models/schemas.js";

const AGENT_GOAL_SYSTEM = `You are the Navigation Task Agent for a web data collection pipeline.

Write a Tinyfish Agent goal: a clear natural-language instruction for browser automation on the given URL.

The agent must navigate the site and return structured JSON with extracted data matching the dataset schema.

Rules:
- Be specific about what to click, search, filter, or paginate.
- State the exact JSON shape to return: { "records": [ { column_name: value, ... } ] }
- Include column names from the schema in the goal.
- For forms: describe fields to fill and how to submit.
- For detail follow-up: explain how to open each item and which fields to collect.
- Limit scope (e.g. first 25 rows) to keep runs reliable.
- Do not invent data; extract only what is visible on the site.
- When workflow_memory is provided, reuse goal patterns from agent_goal_stats_top (high avg_completeness/confidence); avoid domains in domain_stats_weak unless diagnosis says otherwise.
- If latest_diagnosis.prefer_tinyfish_agent or agent_strategy_notes exist, follow them.
- Return ONLY JSON with fields: goal, rationale`;

export async function generateAgentGoal(options: {
userPrompt: string;
spec: DatasetSpec;
triage: SourceTriageResult;
focusFields?: string[];
memory?: WorkflowMemory;
}): Promise<AgentGoal> {
const columnList = options.spec.columns
.map((c) => `${c.name} (${c.type}${c.required ? ", required" : ""})`)
.join(", ");

return completeJson({
label: `agent_goal:${options.triage.final_url}`,
schema: agentGoalSchema,
messages: [
{ role: "system", content: AGENT_GOAL_SYSTEM },
{
role: "user",
content: JSON.stringify({
user_prompt: options.userPrompt,
triage_status: options.triage.status,
triage_reasoning: options.triage.reasoning,
suggested_action: options.triage.suggested_action,
page_url: options.triage.final_url,
page_title: options.triage.title,
row_grain: options.spec.row_grain,
columns: columnList,
focus_fields: options.focusFields ?? [],
extraction_hints: options.spec.extraction_hints,
workflow_memory: options.memory
? memoryContextForAgents(options.memory)
: undefined,
output_shape: { goal: "string", rationale: "string" },
}),
},
],
});
}
94 changes: 94 additions & 0 deletions backend/BigSet_Data_Collection_Agent/src/agents/benchmark-spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import type { ColumnDef, DatasetSpec } from "../models/schemas.js";
import { normalizeSpecColumnOrder } from "./dataset-spec.js";

/** Benchmark harness fields from prompts.json (via env in adapters). */
export interface BenchmarkSpecContext {
promptId?: string;
promptQuality?: string;
persona?: string;
expectedStress?: string;
requiredColumns: string[];
}

export function hasBenchmarkRequiredColumns(
context?: BenchmarkSpecContext,
): context is BenchmarkSpecContext & { requiredColumns: string[] } {
return Boolean(context?.requiredColumns?.length);
}

/** Parse comma-separated column names (CLI flag or benchmark env). */
export function parseRequiredColumns(value: string): string[] {
const columns = value
.split(",")
.map((name) => name.trim())
.filter(Boolean);
if (columns.length === 0) {
throw new Error(
"Required columns must include at least one non-empty column name.",
);
}
return columns;
}

/**
* Ensures every benchmark-required column name exists on the spec as required.
* Types and descriptions come from the dataset-spec LLM when present; otherwise
* minimal placeholders (no per-column name heuristics).
*/
export function mergeSpecWithBenchmarkRequiredColumns(
spec: DatasetSpec,
context: BenchmarkSpecContext,
): DatasetSpec {
const requiredColumns = context.requiredColumns;
const columnsByName = new Map(spec.columns.map((column) => [column.name, column]));

const requiredColumnDefs: ColumnDef[] = requiredColumns.map((name) => {
const existing = columnsByName.get(name);
if (existing) {
return { ...existing, required: true };
}
return {
name,
type: "string",
description: name,
required: true,
};
});

const optionalExtras = spec.columns.filter(
(column) => !requiredColumns.includes(column.name),
);

const columns = [...requiredColumnDefs, ...optionalExtras];
const columnNames = new Set(columns.map((column) => column.name));

const isEntityLikeColumn = (name: string): boolean =>
/(entity|company|organization|business|restaurant|bakery|provider|product|name|title)/i.test(
name,
);

const dedupeKey =
requiredColumns.find(
(name) => columnNames.has(name) && isEntityLikeColumn(name),
) ??
spec.dedupe_keys.find((key) => columnNames.has(key)) ??
requiredColumns.find((name) => columnNames.has(name)) ??
spec.dedupe_keys[0];

const extractionHints = [
spec.extraction_hints,
`Benchmark required columns (use as exact row keys): ${requiredColumns.join(", ")}.`,
context.expectedStress
? `Benchmark stress note: ${context.expectedStress}`
: undefined,
]
.filter(Boolean)
.join("\n");

return normalizeSpecColumnOrder({
...spec,
columns,
dedupe_keys: dedupeKey ? [dedupeKey] : spec.dedupe_keys,
extraction_hints: extractionHints,
});
}
Loading