diff --git a/api/scripts/show-constellation-json.ts b/api/scripts/show-constellation-json.ts
new file mode 100644
index 0000000..b3efbbb
--- /dev/null
+++ b/api/scripts/show-constellation-json.ts
@@ -0,0 +1,17 @@
+/**
+ * Print the constellation built from a nanopub URI as pretty-printed JSON.
+ *
+ * The entry URI is read from the first CLI argument; when it is omitted the
+ * hard-coded default below is used. The JSON (2-space indent) is written to
+ * stdout via console.log, so it can be piped into other tools.
+ *
+ * Uses top-level await, so it must be run as an ES module.
+ */
+import { buildConstellation } from "../src/np/constellation";
+
+const URI =
+  process.argv[2] ??
+  "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8";
+
+const c = await buildConstellation(URI);
+console.log(JSON.stringify(c, null, 2));
diff --git a/api/scripts/test-constellation.ts b/api/scripts/test-constellation.ts
new file mode 100644
index 0000000..f24d290
--- /dev/null
+++ b/api/scripts/test-constellation.ts
@@ -0,0 +1,56 @@
+/**
+ * Smoke-test the constellation builder against a real nanopub URI.
+ *
+ * Usage:
+ *   npx tsx api/scripts/test-constellation.ts [URI]
+ *
+ * Default URI is the Bombus apex CiTO Citation from the weatherxbiodiversity
+ * replication — the plan's acceptance criterion says we should reach 19/19
+ * nanopubs from this entry.
+ */
+import { buildConstellation } from "../src/np/constellation";
+
+const DEFAULT_URI =
+  "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8";
+
+// Label used for nodes that carry no step type, shared by the breakdown and
+// the nanopub list so both name such nodes identically.
+const NO_TEMPLATE = "(no template)";
+
+// First CLI argument overrides the default entry URI.
+const uri = process.argv[2] ?? DEFAULT_URI;
+
+console.error(`Building constellation from: ${uri}`);
+console.error("(this hits KnowledgePixels SPARQL + W3ID TriG resolver)\n");
+
+// Wall-clock time of the whole build, reported in seconds with one decimal.
+const t0 = Date.now();
+const c = await buildConstellation(uri);
+const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
+
+console.error(
+  `\nReached ${c.nodeCount} nanopubs / ${c.edgeCount} edges in ${elapsed}s\n`,
+);
+
+// Tally nodes per step type; typed explicitly so the counts are not `any`.
+const byType = new Map<string, number>();
+for (const n of c.nodes) {
+  const key = n.stepType || NO_TEMPLATE;
+  byType.set(key, (byType.get(key) ?? 0) + 1);
+}
+
+// Print the tally, largest count first.
+console.error("Step-type breakdown:");
+for (const [t, count] of [...byType.entries()].sort((a, b) => b[1] - a[1])) {
+  console.error(` ${String(count).padStart(3)} × ${t}`);
+}
+
+console.error("\nNanopub list:");
+for (const n of c.nodes) {
+  // Prefer the short RA… identifier from the /np/ path; else the full URI.
+  const id = n.uri.match(/\/np\/(RA[A-Za-z0-9_-]+)/)?.[1] ?? n.uri;
+  console.error(` ${id} ${n.stepType || NO_TEMPLATE}`);
+}
+
+console.error(`\nExternal citations (${c.externalCitations.length}):`);
+for (const e of c.externalCitations) console.error(` ${e}`);
diff --git a/api/src/index.ts b/api/src/index.ts
index a2e719b..06a1de4 100644
--- a/api/src/index.ts
+++ b/api/src/index.ts
@@ -3,6 +3,7 @@ import { Session, User } from "better-auth";
 import { Hono } from "hono";
 import { cors } from "hono/cors";
 import health from "./health";
+import np from "./np";
 import notifications from "./notifications";
 import orcid from "./orcid";
 import proxy from "./proxy";
@@ -62,6 +63,7 @@ app.on(["POST", "GET"], "/auth/*", (c) => getAuth(c.env).handler(c.req.raw));
 
 // Endpoints that require auth
 app.route("/notifications", notifications);
+app.route("/np", np);
 app.route("/proxy", proxy);
 app.route("/signing", signing);
 
diff --git a/api/src/np/constellation.test.ts b/api/src/np/constellation.test.ts
new file mode 100644
index 0000000..3b63c3a
--- /dev/null
+++ b/api/src/np/constellation.test.ts
@@ -0,0 +1,1518 @@
+/**
+ * Tests for the BFS constellation builder.
+ *
+ * Mocks global fetch to return canned SPARQL + TriG responses for a synthetic
+ * 4-node FORRT chain shaped like:
+ *
+ *   Apex CiTO ──refersTo──▶ Outcome ──(only in TriG)──▶ Claim
+ *   │                                                     │
+ *   └────────────(only in TriG)──── Quote ◀───refersTo────┘
+ *
+ * Regression targets:
+ *   - Outcome→Claim edge lives only in the TriG body, NOT in the SPARQL
+ *     networkGraph index. The mining step has to surface it (14/19→19/19
+ *     bug from session #1).
+ *   - Template-definition nanopubs ("Defining a/an …") don't expand outward.
+ * - maxNodes / depthLimit caps are honoured. + * - SPARQL 503 retry stays correct under the BFS traversal load. + */ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { buildConstellation, classifyStepKind } from "./constellation"; + +// Tiny FORRT-shaped graph used as fixture across the tests. +const APEX = "https://w3id.org/sciencelive/np/RAapex0000000000000000000000000000000000000"; +const OUTCOME = "https://w3id.org/sciencelive/np/RAoutcome000000000000000000000000000000000"; +const CLAIM = "https://w3id.org/sciencelive/np/RAclaim00000000000000000000000000000000000"; +const QUOTE = "https://w3id.org/sciencelive/np/RAquote00000000000000000000000000000000000"; + +const TPL_CITO = "https://w3id.org/np/RAtplCito00000000000000000000000000000000000"; +const TPL_OUTCOME = "https://w3id.org/np/RAtplOutcome00000000000000000000000000000000"; +const TPL_CLAIM = "https://w3id.org/np/RAtplClaim000000000000000000000000000000000000"; +const TPL_QUOTE = "https://w3id.org/np/RAtplQuote000000000000000000000000000000000000"; + +function trigFor(self: string, body: string, templateUri: string): string { + return ` +@prefix this: <${self}> . +@prefix nt: . +@prefix rdfs: . + +sub:assertion { + ${body} +} + +sub:pubinfo { + this: nt:wasCreatedFromTemplate <${templateUri}> . +} +`; +} + +function tplTrigFor(label: string): string { + // Minimal template TriG — the canonical AssertionTemplate-block label is + // what extractTemplateLabel picks up. + return ` +@prefix nt: . +@prefix rdfs: . + +sub:assertion { + sub:assertion a nt:AssertionTemplate; + rdfs:label "${label}"; + nt:hasStatement sub:st01 . +} +`; +} + +function sparqlBindings(rows: { np: string; template?: string }[]): string { + return JSON.stringify({ + results: { + bindings: rows.map((r) => ({ + np: { type: "uri", value: r.np }, + ...(r.template + ? 
{ template: { type: "uri", value: r.template } } + : {}), + })), + }, + }); +} + +const SPARQL_URL = "https://query.knowledgepixels.com/repo/full"; + +/** + * Synthetic KP — returns canned bodies for known URIs. The chain shape: + * - Apex refersToNanopub Outcome (SPARQL says so). + * - Outcome's TriG body references Claim (NOT in SPARQL networkGraph). + * - Claim's TriG body references Quote (NOT in SPARQL networkGraph). + * - All four are real chain steps; templates are not. + */ +function makeMockKp() { + const trigMap: Record = { + [`https://w3id.org/np/${APEX.split("/").pop()}`]: trigFor( + APEX, + `<${APEX}> <${OUTCOME}> .`, + TPL_CITO, + ), + [`https://w3id.org/np/${OUTCOME.split("/").pop()}`]: trigFor( + OUTCOME, + `<${OUTCOME}> <${CLAIM}> .`, + TPL_OUTCOME, + ), + [`https://w3id.org/np/${CLAIM.split("/").pop()}`]: trigFor( + CLAIM, + `<${CLAIM}> <${QUOTE}> .`, + TPL_CLAIM, + ), + [`https://w3id.org/np/${QUOTE.split("/").pop()}`]: trigFor( + QUOTE, + `<${QUOTE}> a .`, + TPL_QUOTE, + ), + [TPL_CITO]: tplTrigFor("Declare citations with CiTO"), + [TPL_OUTCOME]: tplTrigFor( + "Declaring a replication study outcome according to FORRT", + ), + [TPL_CLAIM]: tplTrigFor("Declaring an original claim according to FORRT"), + [TPL_QUOTE]: tplTrigFor( + "Annotating a paper quotation with personal interpretation", + ), + }; + + // Networkgraph edges that KP would materialise — note Outcome→Claim and + // Claim→Quote are MISSING here on purpose. That's the bug class we mine + // from TriG bodies to bridge. + const sparqlEdges: Record = { + [APEX]: { references: [OUTCOME], referencedBy: [] }, + [OUTCOME]: { references: [], referencedBy: [APEX] }, + [CLAIM]: { references: [], referencedBy: [] }, + [QUOTE]: { references: [], referencedBy: [] }, + }; + + return { + fetch: vi.fn(async (url: string | URL | Request, init?: RequestInit) => { + const urlStr = typeof url === "string" ? 
url : url.toString(); + + // SPARQL query + if (urlStr === SPARQL_URL) { + const body = init?.body; + const queryText = + body instanceof URLSearchParams + ? body.get("query") ?? "" + : String(body ?? ""); + // Extract the URI from the bracketed `<URI>` substitution. + const uriMatch = /<([^>]+)>/.exec(queryText); + const uri = uriMatch?.[1] ?? ""; + const edges = sparqlEdges[uri] ?? { references: [], referencedBy: [] }; + // Detect direction by which side of `npa:refersToNanopub` carries + // the URI literal — `<URI> refersToNanopub ?np` is the outgoing + // query; `?np refersToNanopub <URI>` is the incoming query. + const useReferences = /<[^>]+>\s+npa:refersToNanopub\s+\?np/.test( + queryText, + ); + const rows = useReferences ? edges.references : edges.referencedBy; + return new Response( + sparqlBindings(rows.map((np) => ({ np }))), + { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }, + ); + } + + // TriG resolver + const body = trigMap[urlStr]; + if (body) { + return new Response(body, { + status: 200, + headers: { "content-type": "application/trig" }, + }); + } + + return new Response("not found", { status: 404 }); + }), + trigMap, + }; +} + +describe("buildConstellation", () => { + let kp: ReturnType; + + beforeEach(() => { + kp = makeMockKp(); + vi.stubGlobal("fetch", kp.fetch); + }); + + afterEach(() => { + vi.unstubAllGlobals(); + }); + + it("reaches all 4 nodes via SPARQL + TriG mining", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 5, + maxNodes: 80, + concurrency: 2, + }); + + expect(c.nodeCount).toBe(4); + expect(c.nodes.map((n) => n.uri).sort()).toEqual( + [APEX, OUTCOME, CLAIM, QUOTE].sort(), + ); + }); + + it("surfaces TriG-only edges that KP networkGraph misses", async () => { + // Regression for the 14/19→19/19 bug. Outcome→Claim is NOT in the + // sparqlEdges fixture, only in the Outcome's TriG body. If TriG mining + // is removed, Claim and Quote disappear from the result. 
+ const c = await buildConstellation(APEX, { + depthLimit: 5, + maxNodes: 80, + concurrency: 2, + }); + const uris = c.nodes.map((n) => n.uri); + expect(uris).toContain(CLAIM); + expect(uris).toContain(QUOTE); + }); + + it("attaches the correct template-derived stepType to each node", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 5, + maxNodes: 80, + concurrency: 2, + }); + const byUri = new Map(c.nodes.map((n) => [n.uri, n.stepType])); + expect(byUri.get(APEX)).toBe("Declare citations with CiTO"); + expect(byUri.get(OUTCOME)).toBe( + "Declaring a replication study outcome according to FORRT", + ); + expect(byUri.get(CLAIM)).toBe( + "Declaring an original claim according to FORRT", + ); + expect(byUri.get(QUOTE)).toBe( + "Annotating a paper quotation with personal interpretation", + ); + }); + + it("honours maxNodes cap", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 5, + maxNodes: 2, + concurrency: 2, + }); + expect(c.nodeCount).toBeLessThanOrEqual(2); + }); + + it("does not expand past depthLimit", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 1, + maxNodes: 80, + concurrency: 2, + }); + // depth 0 = apex; depth 1 = outcome (via SPARQL) AND its TriG-mined + // edge to Claim. Quote is reachable only from Claim (depth 2), so it + // should not be present. 
+ expect(c.nodes.map((n) => n.uri)).not.toContain(QUOTE); + }); +}); + +// ============================================================================= +// BOUNDARY + EDGE CASES — round 2 +// ============================================================================= + +describe("buildConstellation edge cases", () => { + afterEach(() => { + vi.unstubAllGlobals(); + }); + + it("does not infinite-loop on cycles (A → B → A)", async () => { + const A = "https://w3id.org/sciencelive/np/RAcycleA000000000000000000000000000000000000"; + const B = "https://w3id.org/sciencelive/np/RAcycleB000000000000000000000000000000000000"; + const TPL = "https://w3id.org/np/RAtplCycle00000000000000000000000000000000000"; + + const trigMap: Record = { + [`https://w3id.org/np/${A.split("/").pop()}`]: trigFor( + A, + `<${A}> <${B}> .`, + TPL, + ), + [`https://w3id.org/np/${B.split("/").pop()}`]: trigFor( + B, + `<${B}> <${A}> .`, + TPL, + ), + [TPL]: tplTrigFor("Cycle Template"), + }; + + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) { + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + } + const body = trigMap[u]; + if (body) { + return new Response(body, { + status: 200, + headers: { "content-type": "application/trig" }, + }); + } + return new Response("nf", { status: 404 }); + }), + ); + + const c = await buildConstellation(A, { + depthLimit: 10, + maxNodes: 50, + concurrency: 2, + }); + // Cycle should resolve to exactly 2 nodes — A and B, neither re-processed. + expect(c.nodeCount).toBe(2); + expect(c.nodes.map((n) => n.uri).sort()).toEqual([A, B].sort()); + }); + + it("template-DEFINITION nodes do not expand outward", async () => { + // Entry is a real chain step that references a template-DEFINITION + // nanopub. 
The template-def MUST be added to nodes (it's a neighbour) + // but its OWN neighbours (which the fixture provides) must NOT appear. + const ENTRY = "https://w3id.org/sciencelive/np/RAentry000000000000000000000000000000000000"; + const TDEF = "https://w3id.org/sciencelive/np/RAtdef00000000000000000000000000000000000000"; + const POISON = "https://w3id.org/sciencelive/np/RApoison0000000000000000000000000000000000"; + const TPL_DEF_LABEL = "https://w3id.org/np/RAtplDef000000000000000000000000000000000000"; + const TPL_NORMAL = "https://w3id.org/np/RAtplNormal0000000000000000000000000000000000"; + + const trigMap: Record = { + [`https://w3id.org/np/${ENTRY.split("/").pop()}`]: trigFor( + ENTRY, + `<${ENTRY}> <${TDEF}> .`, + TPL_NORMAL, + ), + [`https://w3id.org/np/${TDEF.split("/").pop()}`]: trigFor( + TDEF, + `<${TDEF}> <${POISON}> .`, + TPL_DEF_LABEL, + ), + [`https://w3id.org/np/${POISON.split("/").pop()}`]: trigFor( + POISON, + `<${POISON}> a .`, + TPL_NORMAL, + ), + [TPL_DEF_LABEL]: tplTrigFor("Defining an assertion template"), + [TPL_NORMAL]: tplTrigFor("A normal chain step template"), + }; + + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) { + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + } + const body = trigMap[u]; + if (body) { + return new Response(body, { + status: 200, + headers: { "content-type": "application/trig" }, + }); + } + return new Response("nf", { status: 404 }); + }), + ); + + const c = await buildConstellation(ENTRY, { + depthLimit: 5, + maxNodes: 80, + concurrency: 2, + }); + const uris = c.nodes.map((n) => n.uri); + expect(uris).toContain(ENTRY); + expect(uris).toContain(TDEF); + // POISON is reachable only via the template-def's outgoing edge — + // the expand-stop heuristic must block it. 
+ expect(uris).not.toContain(POISON); + }); + + it("returns an empty constellation when the entry URI is unreachable (404)", async () => { + vi.stubGlobal( + "fetch", + vi.fn(async () => new Response("not found", { status: 404 })), + ); + const c = await buildConstellation( + "https://w3id.org/sciencelive/np/RA404404404404404404404404404404404404404404", + { depthLimit: 5, maxNodes: 80, concurrency: 2 }, + ); + expect(c.nodeCount).toBe(0); + expect(c.edges).toEqual([]); + expect(c.externalCitations).toEqual([]); + }); + + it("a TriG that self-references its own URI does not produce a self-edge", async () => { + const SELF = + "https://w3id.org/sciencelive/np/RAselfRef00000000000000000000000000000000000"; + const TPL = "https://w3id.org/np/RAtplSelf00000000000000000000000000000000000"; + + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + if (u === `https://w3id.org/np/${SELF.split("/").pop()}`) + return new Response( + trigFor(SELF, `<${SELF}> <${SELF}> .`, TPL), + { status: 200, headers: { "content-type": "application/trig" } }, + ); + if (u === TPL) + return new Response(tplTrigFor("Self-ref template"), { + status: 200, + headers: { "content-type": "application/trig" }, + }); + return new Response("nf", { status: 404 }); + }), + ); + + const c = await buildConstellation(SELF, { + depthLimit: 5, + maxNodes: 80, + concurrency: 2, + }); + expect(c.nodeCount).toBe(1); + expect(c.edges.filter((e) => e.source === e.target)).toEqual([]); + }); + + it("accumulates external citations across all visited nodes", async () => { + const A = "https://w3id.org/sciencelive/np/RAextcitA00000000000000000000000000000000000"; + const B = "https://w3id.org/sciencelive/np/RAextcitB00000000000000000000000000000000000"; + const TPL = 
"https://w3id.org/np/RAtplExt000000000000000000000000000000000000"; + + const trigMap: Record = { + [`https://w3id.org/np/${A.split("/").pop()}`]: trigFor( + A, + `<${A}> dct:references , <${B}> .`, + TPL, + ), + [`https://w3id.org/np/${B.split("/").pop()}`]: trigFor( + B, + `<${B}> dct:references .`, + TPL, + ), + [TPL]: tplTrigFor("Citation template"), + }; + + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + const body = trigMap[u]; + if (body) + return new Response(body, { + status: 200, + headers: { "content-type": "application/trig" }, + }); + return new Response("nf", { status: 404 }); + }), + ); + + const c = await buildConstellation(A, { + depthLimit: 5, + maxNodes: 80, + concurrency: 2, + }); + expect(c.externalCitations.sort()).toEqual([ + "https://doi.org/10.1126/science.aax8591", + "https://doi.org/10.5281/zenodo.20113777", + ]); + }); + + it("concurrency=1 produces the same node set as concurrency=8", async () => { + // Re-uses the 4-node FORRT fixture. Run twice with different concurrency + // levels; node sets must match. + const kp1 = makeMockKp(); + vi.stubGlobal("fetch", kp1.fetch); + const c1 = await buildConstellation(APEX, { + depthLimit: 5, + maxNodes: 80, + concurrency: 1, + }); + vi.unstubAllGlobals(); + + const kp2 = makeMockKp(); + vi.stubGlobal("fetch", kp2.fetch); + const c2 = await buildConstellation(APEX, { + depthLimit: 5, + maxNodes: 80, + concurrency: 8, + }); + + expect(new Set(c1.nodes.map((n) => n.uri))).toEqual( + new Set(c2.nodes.map((n) => n.uri)), + ); + }); + + it("recovers from a transient SPARQL 503 mid-traversal", async () => { + // First call to a node's SPARQL fails with 503; retry succeeds; the + // BFS continues and reaches all 4 nodes. 
+ const kp = makeMockKp(); + let sparqlCallCount = 0; + const originalFetch = kp.fetch; + const flakyFetch = vi.fn(async (url: string | URL | Request, init?: RequestInit) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) { + sparqlCallCount++; + if (sparqlCallCount === 2) { + // Inject one transient failure on the second SPARQL call. + return new Response("503", { status: 503 }); + } + } + return originalFetch(url, init); + }); + vi.stubGlobal("fetch", flakyFetch); + + const c = await buildConstellation(APEX, { + depthLimit: 5, + maxNodes: 80, + concurrency: 2, + }); + expect(c.nodeCount).toBe(4); + }); + + it("an empty entry URI string causes the constellation to be empty", async () => { + // buildConstellation is called with a pre-validated URI by the HTTP + // route, but defensive guard: if somehow an empty string slips through, + // we should NOT crash. The TriG fetch will fail and the BFS terminates. + vi.stubGlobal( + "fetch", + vi.fn(async () => new Response("nf", { status: 404 })), + ); + const c = await buildConstellation("", { + depthLimit: 5, + maxNodes: 80, + concurrency: 2, + }); + expect(c.nodeCount).toBe(0); + }); +}); + +// ============================================================================= +// STRUCTURED CHAINS[] ASSEMBLY (Phase A — start-a-new-replication payload) +// ============================================================================= + +describe("classifyStepKind", () => { + it("maps each FORRT template label to its step kind", () => { + const cases: [string, ReturnType][] = [ + [ + "Declaring a replication study outcome according to FORRT", + "outcome", + ], + ["Declaring a replication study design according to FORRT", "study"], + ["Declaring an original claim according to FORRT", "claim"], + [ + "Annotating a paper quotation with personal interpretation", + "quote", + ], + [ + "Expressing a statement about research as an AIDA sentence", + "aida", + ], + ["Declare citations with CiTO", 
"cito"], + ["Describing research software at summary level - simple", "research-software"], + ["Science Live Research Synthesis", "research-synthesis"], + ]; + for (const [label, expected] of cases) { + expect(classifyStepKind(label)).toBe(expected); + } + }); + + it("returns 'other' for unrecognised labels", () => { + expect(classifyStepKind("Some unrelated template")).toBe("other"); + expect(classifyStepKind("")).toBe("other"); + }); + + it("is case-insensitive", () => { + expect(classifyStepKind("DECLARING A REPLICATION STUDY OUTCOME according to forrt")).toBe( + "outcome", + ); + }); +}); + +describe("buildConstellation chain assembly", () => { + // Full FORRT chain fixture: one chain anchored by an Outcome, with each + // FORRT step shaped like real Bombus chain data so the structured field + // extractors and chain linkage are exercised end-to-end. + const APEX = + "https://w3id.org/sciencelive/np/RAapexCito000000000000000000000000000000"; + const QUOTE = + "https://w3id.org/sciencelive/np/RAquoteAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const AIDA = + "https://w3id.org/sciencelive/np/RAaidaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const CLAIM = + "https://w3id.org/sciencelive/np/RAclaimAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const STUDY = + "https://w3id.org/sciencelive/np/RAstudyAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const OUTCOME = + "https://w3id.org/sciencelive/np/RAoutcomeAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const CITO_OUTCOME = + "https://w3id.org/sciencelive/np/RAcitoOutAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const RS = "https://w3id.org/sciencelive/np/RArsAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const SYNTHESIS = + "https://w3id.org/sciencelive/np/RAsynthesisAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const PAPER_DOI = "https://doi.org/10.1126/science.aax8591"; + const ZENODO_DOI = "https://doi.org/10.5281/zenodo.20113787"; + const AIDA_URI = + "http://purl.org/aida/Projected%20Iberian%20Bombus%20extirpation%20risk%20is%20grid-sensitive."; + + function tplOf(label: string): string { + 
return `https://w3id.org/np/${label.replace(/[^A-Za-z]/g, "")}T0000000000000000000000`; + } + const TPL_APEX = tplOf("Declare citations with CiTO"); + const TPL_OUTCOME = tplOf("Declaring a replication study outcome according to FORRT"); + const TPL_STUDY = tplOf("Declaring a replication study design according to FORRT"); + const TPL_CLAIM = tplOf("Declaring an original claim according to FORRT"); + const TPL_QUOTE = tplOf("Annotating a paper quotation with personal interpretation"); + const TPL_AIDA = tplOf("Expressing a statement about research as an AIDA sentence"); + const TPL_CITO = tplOf("Declare citations with CiTO"); + const TPL_RS = tplOf("Describing research software at summary level - simple"); + const TPL_SYNTH = tplOf("Science Live Research Synthesis"); + + function tplTrig(label: string): string { + return ` +sub:assertion { + sub:assertion a nt:AssertionTemplate; + rdfs:label "${label}" . +} +`; + } + + function bodyApexCito(): string { + return ` +@prefix this: <${APEX}> . +sub:assertion { + <${APEX}> a ; + <${PAPER_DOI}> . +} +sub:pubinfo { + this: <${TPL_APEX}> . +} +`; + } + + function bodyOutcome(): string { + return ` +@prefix this: <${OUTCOME}> . +sub:assertion { + <${OUTCOME}> a ; + "TEI projection is grid-coupled for low-N species"; + """Per-species rankings are grid-coupled at projection time for species below the cell-count threshold."""; + """Five projection variants tested; main-effects-only at n>=10 yields Spearman rho=+0.97."""; + """Three substrates only; one region."""; + ; + ; + <${ZENODO_DOI}>; + <${STUDY}/some-subject-name>; + "2026-05-09"^^xsd:date . +} +sub:pubinfo { + this: <${TPL_OUTCOME}> . +} +`; + } + + function bodyStudy(): string { + return ` +@prefix this: <${STUDY}> . +sub:assertion { + <${STUDY}> a ; + "Iberian Bombus, three substrates, SSP3-7.0 projection"; + "GLMM with main-effects-only projection extrapolation"; + "Drop interaction terms at projection time only"; + <${CLAIM}/some-subject-name> . 
+} +sub:pubinfo { + this: <${TPL_STUDY}> . +} +`; + } + + function bodyClaim(): string { + return ` +@prefix this: <${CLAIM}> . +sub:assertion { + <${CLAIM}> a ; + <${AIDA_URI}> . +} +sub:pubinfo { + this: <${TPL_CLAIM}> . +} +`; + } + + function bodyQuote(): string { + return ` +@prefix this: <${QUOTE}> . +sub:assertion { + <${QUOTE}> "Increasing frequency of hotter temperatures predicts species declines."; + <${PAPER_DOI}> . +} +sub:pubinfo { + this: <${TPL_QUOTE}> . +} +`; + } + + function bodyAida(): string { + return ` +@prefix this: <${AIDA}> . +sub:assertion { + <${AIDA_URI}> a . +} +sub:pubinfo { + this: <${TPL_AIDA}> . +} +`; + } + + function bodyCitoOutcome(): string { + return ` +@prefix this: <${CITO_OUTCOME}> . +sub:assertion { + <${OUTCOME}> a ; + <${PAPER_DOI}>; + <${PAPER_DOI}> . +} +sub:pubinfo { + this: <${TPL_CITO}> . +} +`; + } + + function bodyRs(): string { + return ` +@prefix this: <${RS}> . +sub:assertion { + <${RS}> a ; + "weatherxbiodiversity-projection"; + ; + <${CLAIM}> . +} +sub:pubinfo { + this: <${TPL_RS}> . +} +`; + } + + function bodySynthesis(): string { + return ` +@prefix this: <${SYNTHESIS}> . +sub:assertion { + <${SYNTHESIS}> a ; + "TEI mechanism is substrate-robust at fit but grid-coupled at projection"; + """The mechanism resolves into two empirically distinct claims."""; + """Iberian peninsula, three substrates."""; + """Three substrates only."""; + """Filter to species with at least 10 occupied cells."""; + <${OUTCOME}> . +} +sub:pubinfo { + this: <${TPL_SYNTH}> . +} +`; + } + + function makeFullChainKp() { + // Each nanopub's TriG references its parent in the chain, so the BFS + // can walk from APEX downward via TriG-mining (no SPARQL networkGraph + // dependency in the fixture). 
+ const inject = (base: string, refs: string[]) => + base.replace( + "}\nsub:pubinfo", + refs.map((r) => ` <${r}> a .`).join("\n") + "\n}\nsub:pubinfo", + ); + + const trigMap: Record = { + [`https://w3id.org/np/${APEX.split("/").pop()}`]: inject( + bodyApexCito(), + [SYNTHESIS], + ), + [`https://w3id.org/np/${SYNTHESIS.split("/").pop()}`]: inject( + bodySynthesis(), + [OUTCOME], + ), + [`https://w3id.org/np/${OUTCOME.split("/").pop()}`]: inject( + bodyOutcome(), + [STUDY, CITO_OUTCOME, RS], + ), + [`https://w3id.org/np/${STUDY.split("/").pop()}`]: inject(bodyStudy(), [ + CLAIM, + ]), + [`https://w3id.org/np/${CLAIM.split("/").pop()}`]: inject(bodyClaim(), [ + AIDA, + ]), + [`https://w3id.org/np/${AIDA.split("/").pop()}`]: inject(bodyAida(), [ + QUOTE, + ]), + [`https://w3id.org/np/${QUOTE.split("/").pop()}`]: bodyQuote(), + [`https://w3id.org/np/${CITO_OUTCOME.split("/").pop()}`]: + bodyCitoOutcome(), + [`https://w3id.org/np/${RS.split("/").pop()}`]: bodyRs(), + [TPL_APEX]: tplTrig("Declare citations with CiTO"), + [TPL_OUTCOME]: tplTrig( + "Declaring a replication study outcome according to FORRT", + ), + [TPL_STUDY]: tplTrig( + "Declaring a replication study design according to FORRT", + ), + [TPL_CLAIM]: tplTrig("Declaring an original claim according to FORRT"), + [TPL_QUOTE]: tplTrig( + "Annotating a paper quotation with personal interpretation", + ), + [TPL_AIDA]: tplTrig( + "Expressing a statement about research as an AIDA sentence", + ), + [TPL_CITO]: tplTrig("Declare citations with CiTO"), + [TPL_RS]: tplTrig( + "Describing research software at summary level - simple", + ), + [TPL_SYNTH]: tplTrig("Science Live Research Synthesis"), + }; + + return vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? 
url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + const body = trigMap[u]; + if (body) + return new Response(body, { + status: 200, + headers: { "content-type": "application/trig" }, + }); + return new Response("nf", { status: 404 }); + }); + } + + beforeEach(() => { + vi.stubGlobal("fetch", makeFullChainKp()); + }); + afterEach(() => vi.unstubAllGlobals()); + + it("identifies the apex CiTO from the entry URI", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + expect(c.apexCito).not.toBeNull(); + expect(c.apexCito?.uri).toBe(APEX); + expect(c.apexCito?.relations).toEqual(["qualifies"]); + expect(c.apexCito?.citedTargets).toContain(PAPER_DOI); + }); + + it("identifies the Research Synthesis", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + expect(c.researchSynthesis).not.toBeNull(); + expect(c.researchSynthesis?.uri).toBe(SYNTHESIS); + expect(c.researchSynthesis?.synthesis).toMatch( + /resolves into two empirically distinct claims/, + ); + expect(c.researchSynthesis?.recommendations).toMatch( + /at least 10 occupied cells/, + ); + }); + + it("picks the paper DOI (deprioritising Zenodo artefact DOIs)", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + expect(c.paperDoi).toBe(PAPER_DOI); + }); + + it("produces a chain with every FORRT step in order", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + expect(c.chains).toHaveLength(1); + const steps = c.chains[0].steps.map((s) => s.step); + expect(steps).toEqual([ + "Quote", + "AIDA", + "Claim", + "Study", + "Outcome", + "CiTO", + "ResearchSoftware", + ]); + }); + + it("carries the FORRT Outcome verdict + confidence on the 
chain", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + expect(c.chains[0].outcomeVerdict).toBe("PartiallySupported"); + expect(c.chains[0].outcomeConfidence).toBe("HighConfidence"); + }); + + it("carries the outcome-level CiTO relations on the chain", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + expect(c.chains[0].citoRelations.sort()).toEqual([ + "extends", + "qualifies", + ]); + }); + + it("carries the Outcome's Zenodo repository on the chain", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + const outcomeStep = c.chains[0].steps.find((s) => s.step === "Outcome"); + expect(outcomeStep?.repository).toBe(ZENODO_DOI); + }); + + it("carries the Research Software's GitHub URL on the chain", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + const rsStep = c.chains[0].steps.find((s) => s.step === "ResearchSoftware"); + expect(rsStep?.repository).toBe( + "https://github.com/annefou/weatherxbiodiversity-projection", + ); + }); + + it("carries the Quote's verbatim text on the chain", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + const quoteStep = c.chains[0].steps.find((s) => s.step === "Quote"); + expect(quoteStep?.text).toBe( + "Increasing frequency of hotter temperatures predicts species declines.", + ); + }); + + it("carries the Claim's type on the chain", async () => { + const c = await buildConstellation(APEX, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + const claimStep = c.chains[0].steps.find((s) => s.step === "Claim"); + expect(claimStep?.type).toBe("model_performance"); + }); + + it("carries the Study's scope and method on the chain", async () => { + const c = await buildConstellation(APEX, { + 
depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + const studyStep = c.chains[0].steps.find((s) => s.step === "Study"); + expect(studyStep?.scope).toMatch(/Iberian Bombus/); + expect(studyStep?.method).toMatch(/main-effects-only/); + }); + + it("returns empty chains[] when there is no Outcome in the constellation", async () => { + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + return new Response("nf", { status: 404 }); + }), + ); + const c = await buildConstellation(APEX, { + depthLimit: 5, + maxNodes: 80, + concurrency: 2, + }); + expect(c.chains).toEqual([]); + }); +}); + +// ============================================================================= +// ROUND 3 — chain assembly adversarial edge cases +// ============================================================================= + +describe("buildConstellation — apex-is-Outcome (entry not a CiTO)", () => { + // User passes an Outcome URI as the entry instead of an apex CiTO. The + // endpoint must still return a usable chain even though apexCito is null. 
+ const OUTCOME_AS_ENTRY = + "https://w3id.org/sciencelive/np/RAoutcomeEntry00000000000000000000000000"; + const STUDY_E = "https://w3id.org/sciencelive/np/RAstudyE000000000000000000000000000000"; + const CLAIM_E = "https://w3id.org/sciencelive/np/RAclaimE000000000000000000000000000000"; + const TPL_OUT = "https://w3id.org/np/RAtplOutE0000000000000000000000000000000"; + const TPL_STU = "https://w3id.org/np/RAtplStuE0000000000000000000000000000000"; + const TPL_CLA = "https://w3id.org/np/RAtplClaE0000000000000000000000000000000"; + + it("returns apexCito=null when the entry isn't a CiTO Citation", async () => { + const trigMap: Record = { + [`https://w3id.org/np/${OUTCOME_AS_ENTRY.split("/").pop()}`]: ` +@prefix this: <${OUTCOME_AS_ENTRY}> . +sub:assertion { + <${OUTCOME_AS_ENTRY}> a ; + ; + <${STUDY_E}> . + <${STUDY_E}> a . +} +sub:pubinfo { + this: <${TPL_OUT}> . +} +`, + [`https://w3id.org/np/${STUDY_E.split("/").pop()}`]: ` +@prefix this: <${STUDY_E}> . +sub:assertion { + <${STUDY_E}> a ; + <${CLAIM_E}> . + <${CLAIM_E}> a . +} +sub:pubinfo { + this: <${TPL_STU}> . +} +`, + [`https://w3id.org/np/${CLAIM_E.split("/").pop()}`]: ` +@prefix this: <${CLAIM_E}> . +sub:assertion { + <${CLAIM_E}> a . +} +sub:pubinfo { + this: <${TPL_CLA}> . +} +`, + [TPL_OUT]: `sub:assertion { sub:assertion a nt:AssertionTemplate; rdfs:label "Declaring a replication study outcome according to FORRT" . }`, + [TPL_STU]: `sub:assertion { sub:assertion a nt:AssertionTemplate; rdfs:label "Declaring a replication study design according to FORRT" . }`, + [TPL_CLA]: `sub:assertion { sub:assertion a nt:AssertionTemplate; rdfs:label "Declaring an original claim according to FORRT" . }`, + }; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? 
url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + const body = trigMap[u]; + if (body) + return new Response(body, { + status: 200, + headers: { "content-type": "application/trig" }, + }); + return new Response("nf", { status: 404 }); + }), + ); + const c = await buildConstellation(OUTCOME_AS_ENTRY, { + depthLimit: 6, + maxNodes: 80, + concurrency: 2, + }); + expect(c.apexCito).toBeNull(); + // The chain should still assemble — Outcome, Study, Claim are reachable. + expect(c.chains).toHaveLength(1); + const steps = c.chains[0].steps.map((s) => s.step); + expect(steps).toContain("Outcome"); + expect(steps).toContain("Study"); + expect(steps).toContain("Claim"); + }); + afterEach(() => vi.unstubAllGlobals()); +}); + +describe("buildConstellation — paper DOI heuristic", () => { + const APEX = + "https://w3id.org/sciencelive/np/RAapex00000000000000000000000000000000000"; + const TPL = "https://w3id.org/np/RAtplApex000000000000000000000000000000000"; + + function setupWithDois(dois: string[]): void { + const citoLines = dois + .map((d) => ` <${d}>;`) + .join("\n"); + const trigMap: Record = { + [`https://w3id.org/np/${APEX.split("/").pop()}`]: ` +@prefix this: <${APEX}> . +sub:assertion { + <${APEX}> a ; +${citoLines} + <${dois[dois.length - 1]}> . +} +sub:pubinfo { + this: <${TPL}> . +} +`, + [TPL]: `sub:assertion { sub:assertion a nt:AssertionTemplate; rdfs:label "Declare citations with CiTO" . }`, + }; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? 
url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + const body = trigMap[u]; + if (body) + return new Response(body, { + status: 200, + headers: { "content-type": "application/trig" }, + }); + return new Response("nf", { status: 404 }); + }), + ); + } + afterEach(() => vi.unstubAllGlobals()); + + it("deprioritises Zenodo DOIs even when they're more frequent than the paper DOI", async () => { + setupWithDois([ + "https://doi.org/10.5281/zenodo.1", + "https://doi.org/10.5281/zenodo.2", + "https://doi.org/10.5281/zenodo.3", + "https://doi.org/10.1126/science.aax8591", + ]); + const c = await buildConstellation(APEX, { + depthLimit: 2, + maxNodes: 20, + concurrency: 2, + }); + expect(c.paperDoi).toBe("https://doi.org/10.1126/science.aax8591"); + }); + + it("picks the most-frequent non-Zenodo DOI when several are present", async () => { + setupWithDois([ + "https://doi.org/10.1/winner", + "https://doi.org/10.1/winner", + "https://doi.org/10.1/winner", + "https://doi.org/10.1/loser", + ]); + const c = await buildConstellation(APEX, { + depthLimit: 2, + maxNodes: 20, + concurrency: 2, + }); + expect(c.paperDoi).toBe("https://doi.org/10.1/winner"); + }); + + it("returns paperDoi='' when the constellation has NO DOIs", async () => { + const trigMap: Record = { + [`https://w3id.org/np/${APEX.split("/").pop()}`]: ` +@prefix this: <${APEX}> . +sub:assertion { + <${APEX}> a . +} +sub:pubinfo { + this: <${TPL}> . +} +`, + [TPL]: `sub:assertion { sub:assertion a nt:AssertionTemplate; rdfs:label "Declare citations with CiTO" . }`, + }; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? 
url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + const body = trigMap[u]; + if (body) + return new Response(body, { + status: 200, + headers: { "content-type": "application/trig" }, + }); + return new Response("nf", { status: 404 }); + }), + ); + const c = await buildConstellation(APEX, { + depthLimit: 2, + maxNodes: 20, + concurrency: 2, + }); + expect(c.paperDoi).toBe(""); + }); + + it("falls back to a Zenodo DOI when there are NO non-Zenodo DOIs", async () => { + setupWithDois([ + "https://doi.org/10.5281/zenodo.solo", + "https://doi.org/10.5281/zenodo.solo", + ]); + const c = await buildConstellation(APEX, { + depthLimit: 2, + maxNodes: 20, + concurrency: 2, + }); + expect(c.paperDoi).toBe("https://doi.org/10.5281/zenodo.solo"); + }); +}); + +describe("buildConstellation — pathological cases", () => { + it("survives templateUri === selfUri (template self-reference)", async () => { + // A nanopub whose `wasCreatedFromTemplate` points at itself. Should not + // infinite-loop and should produce some result (likely flat-noded). + const SELF = + "https://w3id.org/sciencelive/np/RAself00000000000000000000000000000000000"; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + if ( + u === `https://w3id.org/np/${SELF.split("/").pop()}` || + u === SELF + ) { + return new Response( + ` +@prefix this: <${SELF}> . +sub:assertion { + <${SELF}> a . +} +sub:pubinfo { + this: <${SELF}> . 
+} +`, + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + } + return new Response("nf", { status: 404 }); + }), + ); + const c = await buildConstellation(SELF, { + depthLimit: 3, + maxNodes: 10, + concurrency: 2, + }); + expect(c.nodeCount).toBeGreaterThanOrEqual(1); + expect(c.chains).toEqual([]); + }); + + it("survives a Synthesis with no isSupportedBy entries", async () => { + const SYN = + "https://w3id.org/sciencelive/np/RAsynLone000000000000000000000000000000"; + const TPL = "https://w3id.org/np/RAtplSynL0000000000000000000000000000000"; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + if (u === `https://w3id.org/np/${SYN.split("/").pop()}`) + return new Response( + ` +@prefix this: <${SYN}> . +sub:assertion { + <${SYN}> a ; + "Lonely synthesis." . +} +sub:pubinfo { + this: <${TPL}> . +} +`, + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + if (u === TPL) + return new Response( + `sub:assertion { sub:assertion a nt:AssertionTemplate; rdfs:label "Science Live Research Synthesis" . 
}`, + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + return new Response("nf", { status: 404 }); + }), + ); + const c = await buildConstellation(SYN, { + depthLimit: 2, + maxNodes: 10, + concurrency: 2, + }); + expect(c.researchSynthesis).not.toBeNull(); + expect(c.researchSynthesis?.synthesis).toBe("Lonely synthesis."); + expect(c.chains).toEqual([]); // no Outcome → no chain + }); +}); + +afterEach(() => vi.unstubAllGlobals()); + +// ============================================================================= +// ROUND 3 — adversarial fixtures (malformed / unfamiliar TriG) +// ============================================================================= + +describe("buildConstellation — adversarial TriGs", () => { + afterEach(() => vi.unstubAllGlobals()); + + it("survives invalid Turtle syntax in a nanopub TriG", async () => { + const URI = + "https://w3id.org/sciencelive/np/RAgarbage00000000000000000000000000000000"; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? 
url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + if (u === `https://w3id.org/np/${URI.split("/").pop()}`) + return new Response( + "complete garbage {{{ not turtle <<<>>> mismatched", + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + return new Response("nf", { status: 404 }); + }), + ); + const c = await buildConstellation(URI, { + depthLimit: 2, + maxNodes: 10, + concurrency: 2, + }); + expect(c.nodeCount).toBe(1); + expect(c.chains).toEqual([]); + }); + + it("survives a TriG using unknown prefix declarations", async () => { + const URI = + "https://w3id.org/sciencelive/np/RAunknown00000000000000000000000000000000"; + const TPL = "https://w3id.org/np/RAtplUnk00000000000000000000000000000000"; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + if (u === `https://w3id.org/np/${URI.split("/").pop()}`) + return new Response( + ` +@prefix wat: . +sub:assertion { + <${URI}> wat:type wat:Thing; + wat:hasFoo "some literal foo content here" . +} +sub:pubinfo { + <${URI}> <${TPL}> . +} +`, + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + if (u === TPL) + return new Response( + `sub:assertion { sub:assertion a nt:AssertionTemplate; rdfs:label "Unknown template" . 
}`, + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + return new Response("nf", { status: 404 }); + }), + ); + const c = await buildConstellation(URI, { + depthLimit: 2, + maxNodes: 10, + concurrency: 2, + }); + expect(c.nodeCount).toBe(1); + const node = c.nodes[0]; + expect(node.stepType).toBe("Unknown template"); + expect(node.stepKind).toBe("other"); + expect(c.chains).toEqual([]); + }); + + it("survives a TriG with extreme nesting of triple-quoted strings", async () => { + const URI = + "https://w3id.org/sciencelive/np/RAnestedTriple000000000000000000000000"; + const TPL = "https://w3id.org/np/RAtplNest000000000000000000000000000000000"; + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + if (u === `https://w3id.org/np/${URI.split("/").pop()}`) + return new Response( + ` +@prefix this: <${URI}> . +sub:assertion { + <${URI}> """Line 1. +Line 2 with "embedded" quote. +Line 3 with .dots. and ;semicolons; that would terminate the segment if not quoted. +End of multi-line literal."""; + . +} +sub:pubinfo { + this: <${TPL}> . +} +`, + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + if (u === TPL) + return new Response( + `sub:assertion { sub:assertion a nt:AssertionTemplate; rdfs:label "Declaring a replication study outcome according to FORRT" . 
}`, + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + return new Response("nf", { status: 404 }); + }), + ); + const c = await buildConstellation(URI, { + depthLimit: 2, + maxNodes: 10, + concurrency: 2, + }); + expect(c.nodes[0].outcome?.conclusion).toMatch(/Line 1\./); + expect(c.nodes[0].outcome?.conclusion).toMatch(/End of multi-line literal/); + expect(c.nodes[0].outcome?.validationStatus).toBe("Validated"); + }); + + it("filters out RSA-signature base64 blobs from plainTextExcerpts", async () => { + const URI = + "https://w3id.org/sciencelive/np/RAbase64Sig00000000000000000000000000000"; + const TPL = "https://w3id.org/np/RAtplB64Sig00000000000000000000000000000"; + const sig = "MIICIjANBgkqhkiG9w0BAQEFAA" + "A".repeat(400); + vi.stubGlobal( + "fetch", + vi.fn(async (url: string | URL | Request) => { + const u = typeof url === "string" ? url : url.toString(); + if (u === SPARQL_URL) + return new Response(sparqlBindings([]), { + status: 200, + headers: { "content-type": "application/sparql-results+json" }, + }); + if (u === `https://w3id.org/np/${URI.split("/").pop()}`) + return new Response( + ` +@prefix this: <${URI}> . +sub:pubinfo { + this: "${sig}"; + rdfs:label "Real label" . +} +sub:assertion { + <${URI}> rdfs:label "real human-readable content with spaces" . +} +`, + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + if (u === TPL) + return new Response( + `sub:assertion { sub:assertion a nt:AssertionTemplate; rdfs:label "X" . 
}`, + { + status: 200, + headers: { "content-type": "application/trig" }, + }, + ); + return new Response("nf", { status: 404 }); + }), + ); + const c = await buildConstellation(URI, { + depthLimit: 2, + maxNodes: 10, + concurrency: 2, + }); + expect(c.nodes[0].plainTextExcerpts).not.toContain(sig); + expect(c.nodes[0].plainTextExcerpts).toContain( + "real human-readable content with spaces", + ); + }); +}); diff --git a/api/src/np/constellation.ts b/api/src/np/constellation.ts new file mode 100644 index 0000000..a3a25c0 --- /dev/null +++ b/api/src/np/constellation.ts @@ -0,0 +1,662 @@ +import { bindUri, REFERENCES_FROM, REFERENCES_TO } from "./queries"; +import { executeSparql, fetchTrig } from "./sparql"; +import { + canonicalNanopubUri, + type AidaFields, + type CitoFields, + type ClaimFields, + extractAidaFields, + extractCitoFields, + extractClaimFields, + extractDois, + extractExcerpts, + extractGithubUrls, + extractNanopubMeta, + extractNanopubUris, + extractOrcids, + extractOutcomeFields, + extractQuoteFields, + extractResearchSoftwareFields, + extractResearchSynthesisFields, + extractStudyFields, + extractTemplateLabel, + isTemplateDefinitionLabel, + type OutcomeFields, + type QuoteFields, + type ResearchSoftwareFields, + type ResearchSynthesisFields, + type StudyFields, +} from "./trig"; + +/** + * Categorical kind of each chain step, inferred from the template label. + * "other" covers template-definition nanopubs and anything we don't yet + * recognise (we still include them in `nodes[]` for debugging). 
/**
 * Categorical kind of each chain step, inferred from the template label.
 * "other" covers template-definition nanopubs and anything we don't yet
 * recognise (we still include them in `nodes[]` for debugging).
 */
export type StepKind =
  | "quote"
  | "aida"
  | "claim"
  | "study"
  | "outcome"
  | "cito"
  | "research-software"
  | "research-synthesis"
  | "other";

/** One nanopub reached by the traversal, with everything extracted from its TriG. */
export type ConstellationNode = {
  /** URI the nanopub was fetched under. */
  uri: string;
  /** Result of `classifyStepKind(stepType)`. */
  stepKind: StepKind;
  /** Label of the template this nanopub was created from; "" when no template was found. */
  stepType: string;
  /** Template URI found in the TriG; "" when none was found. */
  templateUri: string;
  label: string;
  date: string;
  creators: string[];
  authorsOrcid: string[];
  plainTextExcerpts: string[];
  // Per-template structured fields — only the matching one is populated.
  outcome?: OutcomeFields;
  study?: StudyFields;
  claim?: ClaimFields;
  quote?: QuoteFields;
  aida?: AidaFields;
  cito?: CitoFields;
  researchSoftware?: ResearchSoftwareFields;
  researchSynthesis?: ResearchSynthesisFields;
  /** Every GitHub URL found in this nanopub's TriG. */
  githubUrls: string[];
};

/**
 * A link recorded from an expanded node to one of its discovered neighbours.
 * `relation` is always "refersTo"; neighbours come from both incoming and
 * outgoing reference queries, so `source` is the node being expanded rather
 * than necessarily the citing side.
 */
export type ConstellationEdge = {
  source: string;
  target: string;
  relation: "refersTo";
};

/**
 * One entry in an assembled chain. `step` says which kind of nanopub it is;
 * the optional fields are filled according to that kind (e.g. `text` for
 * Quote/AIDA, `scope`/`method`/`deviations` for Study, `verdict`/`confidence`
 * and friends for Outcome, `relations`/`targets` for CiTO).
 */
export type ChainStep = {
  step: "Quote" | "AIDA" | "Claim" | "Study" | "Outcome" | "CiTO" | "ResearchSoftware";
  uri: string;
  label?: string;
  text?: string;
  type?: string;
  scope?: string;
  method?: string;
  deviations?: string;
  verdict?: string;
  confidence?: string;
  conclusion?: string;
  evidence?: string;
  limitations?: string;
  repository?: string;
  zenodoDoi?: string;
  relations?: string[];
  targets?: string[];
};

/** The steps assembled around a single Outcome nanopub. */
export type Chain = {
  /** Last path segment of the Outcome URI. */
  id: string;
  outcomeUri: string;
  outcomeVerdict: string;
  outcomeConfidence: string;
  /** Relations of the outcome-level CiTO nanopub; empty when none was matched. */
  citoRelations: string[];
  steps: ChainStep[];
};

/** Summary of the entry nanopub when it is a CiTO Citation. */
export type ApexCito = {
  uri: string;
  relations: string[];
  citedTargets: string[];
};

/** Summary of the first Research Synthesis nanopub found in the constellation. */
export type ResearchSynthesisSummary = {
  uri: string;
  label: string;
  synthesis: string;
  conditions: string;
  limitations: string;
  recommendations: string;
};

/** Full result of `buildConstellation`. */
export type Constellation = {
  /** The URI the traversal started from. */
  entry: string;
  /** The primary cited paper DOI shared across the chain, if discoverable. */
  paperDoi: string;
  /** Top-level CiTO Citation nanopub at the apex of the constellation. */
  apexCito: ApexCito | null;
  /** Top-level Research Synthesis nanopub, if one is present. */
  researchSynthesis: ResearchSynthesisSummary | null;
  /** One chain per FORRT Outcome — Quote → AIDA → Claim → Study → Outcome
   *  → CiTO + ResearchSoftware. */
  chains: Chain[];
  // Backwards-compatible flat fields — useful for debugging and for clients
  // that want raw graph data.
  nodeCount: number;
  edgeCount: number;
  sparqlEndpoint: string;
  nodes: ConstellationNode[];
  edges: ConstellationEdge[];
  /** Sorted, de-duplicated DOIs collected from every visited nanopub. */
  externalCitations: string[];
};

/** Knobs for the breadth-first traversal. */
type TraversalOptions = {
  /** Nodes reached at this depth (entry = 0) are recorded but not expanded further. */
  depthLimit: number;
  /** The traversal stops starting new levels once this many nodes are collected. */
  maxNodes: number;
  /** Max concurrent processNode() calls per BFS level. KP rate-limits ~503
   *  under heavier fan-out; 2 is a safe default that finishes a 19-node
   *  chain in ~15-20s. */
  concurrency: number;
  /** Forwarded to every TriG fetch and SPARQL request. */
  signal?: AbortSignal;
};

/** Values used for any option the caller does not supply. */
const DEFAULT_OPTIONS: TraversalOptions = {
  depthLimit: 5,
  maxNodes: 80,
  concurrency: 2,
};
/**
 * BFS the citation graph from `entryUri` using bidirectional SPARQL discovery
 * (incoming + outgoing references) and per-nanopub TriG fetching for content
 * extraction. Mirrors the Python `import-nanopub-chain.py walk()` function.
 *
 * @param entryUri Nanopub URI to start from (depth 0).
 * @param optsIn   Overrides for `DEFAULT_OPTIONS`. A key that is absent or
 *                 explicitly `undefined` falls back to its default.
 * @returns The assembled chains plus the flat node/edge lists. Nanopubs whose
 *          TriG cannot be fetched are left out of `nodes`; the call itself
 *          does not reject for them.
 */
export async function buildConstellation(
  entryUri: string,
  optsIn: Partial<TraversalOptions> = {},
): Promise<Constellation> {
  // Resolve option by option instead of spreading: `{...defaults, ...optsIn}`
  // lets an explicit `undefined` clobber a default, and `size < undefined`
  // is false, which would end the walk before it starts.
  const depthLimit = optsIn.depthLimit ?? DEFAULT_OPTIONS.depthLimit;
  const maxNodes = optsIn.maxNodes ?? DEFAULT_OPTIONS.maxNodes;
  const concurrency = optsIn.concurrency ?? DEFAULT_OPTIONS.concurrency;
  const signal = optsIn.signal ?? DEFAULT_OPTIONS.signal;

  const templateLabelCache = new Map<string, string>();
  const nodes = new Map<string, ConstellationNode>();
  const edges: ConstellationEdge[] = [];
  const externals = new Set<string>();
  const visited = new Set<string>();

  let frontier: { uri: string; depth: number }[] = [
    { uri: entryUri, depth: 0 },
  ];

  while (frontier.length > 0 && nodes.size < maxNodes) {
    // Never start more visits than there is room left under `maxNodes`;
    // otherwise one wide level overshoots the cap.
    const budget = maxNodes - nodes.size;
    const batch: { uri: string; depth: number }[] = [];
    const deferred: { uri: string; depth: number }[] = [];
    for (const item of frontier) {
      if (visited.has(item.uri)) continue;
      if (batch.length < budget) {
        // Marking here (not after the filter) also drops a URI that two
        // parents queued in the same level, so it is fetched only once and
        // its outgoing edges are recorded only once.
        visited.add(item.uri);
        batch.push(item);
      } else {
        // Keep for a later round: if some visits in this batch fail there
        // is still room. Each item carries its own depth.
        deferred.push(item);
      }
    }
    frontier = deferred;
    if (batch.length === 0) continue;

    const results = await runWithConcurrency(batch, concurrency, (item) =>
      processNode(item, templateLabelCache, signal),
    );

    for (const r of results) {
      if (!r) continue;
      nodes.set(r.node.uri, r.node);
      for (const d of r.dois) externals.add(d);

      // Template-definition nanopubs are recorded but never expanded.
      if (isTemplateDefinitionLabel(r.node.stepType)) continue;
      if (r.depth >= depthLimit) continue;

      for (const neighbour of r.neighbours) {
        // The link to the node's own template is not a citation edge.
        if (neighbour === r.node.templateUri) continue;
        edges.push({
          source: r.node.uri,
          target: neighbour,
          relation: "refersTo",
        });
        if (!visited.has(neighbour) && nodes.size < maxNodes) {
          frontier.push({ uri: neighbour, depth: r.depth + 1 });
        }
      }
    }
  }

  const nodeList = [...nodes.values()];
  const { chains, apexCito, researchSynthesis, paperDoi } = assembleChains(
    nodeList,
    entryUri,
  );

  return {
    entry: entryUri,
    paperDoi,
    apexCito,
    researchSynthesis,
    chains,
    nodeCount: nodes.size,
    edgeCount: edges.length,
    sparqlEndpoint: "https://query.knowledgepixels.com/repo/full",
    nodes: nodeList,
    edges,
    externalCitations: [...externals].sort(),
  };
}
// =============================================================================
// Per-node processing
// =============================================================================

/** What one BFS visit yields: the node plus the raw material for the next level. */
type ProcessedNode = {
  /** Depth the nanopub was reached at (entry = 0). */
  depth: number;
  node: ConstellationNode;
  /** Candidate URIs to visit next: SPARQL-discovered ∪ mined from the TriG. */
  neighbours: string[];
  /** Result of `extractDois` on the nanopub's TriG. */
  dois: string[];
};

/**
 * Fetch one nanopub, classify it by its template label, extract its fields and
 * collect its neighbours.
 *
 * @returns `null` when the TriG cannot be fetched (best-effort: one bad
 *          nanopub must not fail the whole traversal).
 */
async function processNode(
  { uri, depth }: { uri: string; depth: number },
  templateLabelCache: Map<string, string>,
  signal?: AbortSignal,
): Promise<ProcessedNode | null> {
  let trig: string;
  try {
    trig = await fetchTrig(uri, signal);
  } catch {
    return null;
  }

  const templateUri = extractTemplateUriFromTrig(trig);
  const stepType = templateUri
    ? await resolveTemplateLabel(templateUri, templateLabelCache, signal)
    : "";

  const stepKind = classifyStepKind(stepType);
  const meta = extractNanopubMeta(trig);

  const node: ConstellationNode = {
    uri,
    stepKind,
    stepType,
    templateUri: templateUri ?? "",
    label: meta.label,
    date: meta.date,
    creators: meta.creators,
    authorsOrcid: extractOrcids(trig),
    plainTextExcerpts: extractExcerpts(trig),
    githubUrls: extractGithubUrls(trig),
  };

  // Attach the per-template structured fields. We always attach when the
  // step kind matches, even if the extraction returned mostly empty fields —
  // that's the contract for callers checking which kind a node is.
  switch (stepKind) {
    case "outcome":
      node.outcome = extractOutcomeFields(trig);
      break;
    case "study":
      node.study = extractStudyFields(trig);
      break;
    case "claim":
      node.claim = extractClaimFields(trig);
      break;
    case "quote":
      node.quote = extractQuoteFields(trig);
      break;
    case "aida":
      node.aida = extractAidaFields(trig);
      break;
    case "cito":
      node.cito = extractCitoFields(trig, {
        selfUri: uri,
        templateUri: templateUri ?? "",
      });
      break;
    case "research-software":
      node.researchSoftware = extractResearchSoftwareFields(trig);
      break;
    case "research-synthesis":
      node.researchSynthesis = extractResearchSynthesisFields(trig);
      break;
    default:
      // "other": no structured fields to attach.
      break;
  }

  // The SPARQL index is optional: when it fails we still have whatever the
  // TriG body itself references.
  let sparqlNeighbours: string[] = [];
  try {
    sparqlNeighbours = await discoverNeighbours(uri, signal);
  } catch {
    sparqlNeighbours = [];
  }

  const merged = new Set<string>(sparqlNeighbours);
  for (const u of extractNanopubUris(trig)) {
    if (u !== uri) merged.add(u);
  }

  return { depth, node, neighbours: [...merged], dois: extractDois(trig) };
}

/**
 * Ask the SPARQL endpoint for nanopubs that reference `uri` and nanopubs that
 * `uri` references. Returns canonicalised, de-duplicated URIs, excluding `uri`.
 */
async function discoverNeighbours(
  uri: string,
  signal?: AbortSignal,
): Promise<string[]> {
  // Serialise the two SPARQL hits — KP's nginx returns intermittent 503s
  // under concurrent load, and the executeSparql retry helper handles
  // transient failures but doesn't reduce parallel pressure.
  const incoming = await executeSparql(bindUri(REFERENCES_TO, uri), signal);
  const outgoing = await executeSparql(bindUri(REFERENCES_FROM, uri), signal);
  const out = new Set<string>();
  for (const row of [...incoming, ...outgoing]) {
    const canon = canonicalNanopubUri(row.np ?? "");
    if (canon && canon !== uri) out.add(canon);
  }
  return [...out];
}

/**
 * Look up a template's label, fetching the template nanopub at most once per
 * traversal. A failed fetch is cached as "" so it is not retried.
 */
async function resolveTemplateLabel(
  templateUri: string,
  cache: Map<string, string>,
  signal?: AbortSignal,
): Promise<string> {
  const cached = cache.get(templateUri);
  if (cached !== undefined) return cached;
  try {
    const trig = await fetchTrig(templateUri, signal);
    const label = extractTemplateLabel(trig);
    cache.set(templateUri, label);
    return label;
  } catch {
    cache.set(templateUri, "");
    return "";
  }
}

// The predicate must be present, in prefixed or full-IRI form. With an empty
// alternative here the pattern would match the first whitespace-preceded IRI
// in the document (normally the `@prefix this:` declaration).
// NOTE(review): full IRI taken from the public nanopub template vocabulary
// (`nt:` = https://w3id.org/np/o/ntemplate/) — confirm against the fixtures.
const TEMPLATE_PREDICATE_RE =
  /(?:nt:wasCreatedFromTemplate|<https:\/\/w3id\.org\/np\/o\/ntemplate\/wasCreatedFromTemplate>)\s+<([^>]+)>/;

/** Return the object IRI of the first `wasCreatedFromTemplate` statement, or `null`. */
function extractTemplateUriFromTrig(trig: string): string | null {
  const m = TEMPLATE_PREDICATE_RE.exec(trig);
  return m ? m[1] : null;
}

// =============================================================================
// Step-kind classification
// =============================================================================

// Checked top to bottom against the lower-cased label; first hit wins.
const STEP_KIND_RULES: readonly (readonly [string, StepKind])[] = [
  ["research synthesis", "research-synthesis"],
  ["research software", "research-software"],
  ["replication study outcome", "outcome"],
  ["replication study design", "study"],
  ["original claim", "claim"],
  ["paper quotation", "quote"],
  ["aida sentence", "aida"],
  ["citations with cito", "cito"],
  ["cito citation", "cito"],
];

/**
 * Map a template label to a step kind by case-insensitive substring match.
 * Unrecognised labels (including "") classify as "other".
 */
export function classifyStepKind(stepType: string): StepKind {
  const s = stepType.toLowerCase();
  for (const [needle, kind] of STEP_KIND_RULES) {
    if (s.includes(needle)) return kind;
  }
  return "other";
}

// =============================================================================
// Chain assembly
// =============================================================================

type AssemblyResult = {
  chains: Chain[];
  apexCito: ApexCito | null;
  researchSynthesis: ResearchSynthesisSummary | null;
  paperDoi: string;
};

/**
 * Turn the flat node list into the summary structures: the apex CiTO, the
 * Research Synthesis, the primary paper DOI, and one chain per Outcome node.
 *
 * @param nodes    Every node collected by the traversal.
 * @param entryUri URI the traversal started from; decides the apex CiTO and is
 *                 excluded from the outcome-level CiTO match.
 */
function assembleChains(
  nodes: ConstellationNode[],
  entryUri: string,
): AssemblyResult {
  const byUri = new Map<string, ConstellationNode>();
  const byKind = new Map<StepKind, ConstellationNode[]>();
  for (const n of nodes) {
    byUri.set(n.uri, n);
    const list = byKind.get(n.stepKind) ?? [];
    list.push(n);
    byKind.set(n.stepKind, list);
  }

  // Apex CiTO — the entry-URI node, and only when that node is a CiTO
  // Citation. Any other kind of entry leaves `apexCito` null; there is no
  // fallback search among the other CiTO nodes.
  const entryNode = byUri.get(entryUri);
  let apexCito: ApexCito | null = null;
  if (entryNode && entryNode.stepKind === "cito" && entryNode.cito) {
    apexCito = {
      uri: entryNode.uri,
      relations: entryNode.cito.relations,
      citedTargets: entryNode.cito.citedTargets,
    };
  }

  // Research Synthesis — the first node of that kind. Synthesis is unique
  // at the apex of a multi-chain constellation; if there are multiple, the
  // first one is used.
  const synthesisNode = (byKind.get("research-synthesis") ?? [])[0];
  const researchSynthesis: ResearchSynthesisSummary | null =
    synthesisNode && synthesisNode.researchSynthesis
      ? {
          uri: synthesisNode.uri,
          label: synthesisNode.label,
          synthesis: synthesisNode.researchSynthesis.synthesisDescription,
          conditions: synthesisNode.researchSynthesis.conditions,
          limitations: synthesisNode.researchSynthesis.limitations,
          recommendations: synthesisNode.researchSynthesis.recommendations,
        }
      : null;

  // Paper DOI — the most common DOI cited across the constellation's CiTO
  // and Quote nodes, biased toward those that aren't Zenodo (artefact DOIs).
  const paperDoi = findPrimaryPaperDoi(nodes);

  // Build a chain per Outcome.
  const outcomes = byKind.get("outcome") ?? [];
  const claims = byKind.get("claim") ?? [];
  const studies = byKind.get("study") ?? [];
  const quotes = byKind.get("quote") ?? [];
  const aidas = byKind.get("aida") ?? [];
  const citos = byKind.get("cito") ?? [];
  const researchSoftwares = byKind.get("research-software") ?? [];

  const chains: Chain[] = [];
  for (const outcome of outcomes) {
    const out = outcome.outcome;
    if (!out) continue;

    // Study via explicit isOutcomeOf. The predicate value sometimes carries
    // a trailing assertion-subject path like `…/soroye-tei-mechanism`; we
    // canonicalise to the bare nanopub URI before the lookup.
    const studyUri = canonicalNanopubUri(out.studyUri) ?? out.studyUri;
    const study = byUri.get(studyUri) ?? findRelated(outcome, studies);
    const claimUri = study?.study?.claimUri
      ? canonicalNanopubUri(study.study.claimUri) ?? study.study.claimUri
      : "";
    const claim = byUri.get(claimUri) ?? findRelated(outcome, claims);

    // AIDA via the aida URI the Claim references — match the AIDA node
    // whose decoded sentence equals the Claim's aidaStatement decoded.
    const aida = findAidaForClaim(claim, aidas) ?? findRelated(outcome, aidas);

    // Quote — find the Quote node that the AIDA or Claim references.
    const quote = findQuoteForAida(aida, quotes) ?? findRelated(outcome, quotes);

    // CiTO node whose citing entity (subject) is THIS outcome. This is the
    // outcome-level CiTO citation that connects the Outcome to the upstream
    // paper DOI and any artefact DOIs. The entry node is excluded so the apex
    // CiTO is not reported a second time as an outcome-level one.
    const cito = citos.find(
      (c) => c.cito?.citingEntity === outcome.uri && c.uri !== entryUri,
    );

    // Research Software node that supports this Outcome (or any of its
    // upstream Claim/Study).
    const rs = researchSoftwares.find((r) =>
      r.researchSoftware?.supportsTargets.some((target: string) => {
        const canon = canonicalNanopubUri(target);
        return (
          canon === outcome.uri ||
          (claim && canon === claim.uri) ||
          (study && canon === study.uri)
        );
      }),
    );

    // Steps are emitted in fixed order; a step is skipped when its node, or
    // that node's structured fields, are missing. The Outcome is always there.
    const steps: ChainStep[] = [];
    if (quote && quote.quote)
      steps.push({
        step: "Quote",
        uri: quote.uri,
        text: quote.quote.quotedText,
        targets: quote.quote.citedDoi ? [quote.quote.citedDoi] : [],
      });
    if (aida && aida.aida)
      steps.push({ step: "AIDA", uri: aida.uri, text: aida.aida.sentence });
    if (claim && claim.claim)
      steps.push({
        step: "Claim",
        uri: claim.uri,
        type: claim.claim.claimType,
        label: claim.label,
      });
    if (study && study.study)
      steps.push({
        step: "Study",
        uri: study.uri,
        scope: study.study.scope,
        method: study.study.methodology,
        deviations: study.study.deviations,
      });
    steps.push({
      step: "Outcome",
      uri: outcome.uri,
      label: outcome.label,
      verdict: out.validationStatus,
      confidence: out.confidenceLevel,
      conclusion: out.conclusion,
      evidence: out.evidence,
      limitations: out.limitations,
      repository: out.repository,
    });
    if (cito && cito.cito)
      steps.push({
        step: "CiTO",
        uri: cito.uri,
        relations: cito.cito.relations,
        targets: cito.cito.citedTargets,
      });
    if (rs && rs.researchSoftware)
      steps.push({
        step: "ResearchSoftware",
        uri: rs.uri,
        label: rs.label,
        repository: rs.researchSoftware.repository,
        zenodoDoi: rs.researchSoftware.zenodoDoi,
      });

    chains.push({
      id: outcome.uri.split("/").pop() ?? outcome.uri,
      outcomeUri: outcome.uri,
      outcomeVerdict: out.validationStatus,
      outcomeConfidence: out.confidenceLevel,
      citoRelations: cito?.cito?.relations ?? [],
      steps,
    });
  }

  return { chains, apexCito, researchSynthesis, paperDoi };
}
/**
 * Pick the most likely primary paper DOI from the constellation's external
 * citations.
 *
 * Heuristic: count how often each `doi.org` URL appears across CiTO
 * `citedTargets` and Quote `citedDoi` values, then prefer non-Zenodo DOIs
 * over Zenodo artefact DOIs (`10.5281/zenodo…`), and within the same class
 * the DOI cited most often. Ties keep the DOI that was seen first.
 *
 * @param nodes All nodes of the constellation.
 * @returns The winning DOI URL, or `""` when no `doi.org` URL is cited.
 */
function findPrimaryPaperDoi(nodes: ConstellationNode[]): string {
  const counts = new Map<string, number>();
  for (const n of nodes) {
    const dois: string[] = [];
    if (n.cito) dois.push(...n.cito.citedTargets);
    if (n.quote?.citedDoi) dois.push(n.quote.citedDoi);
    for (const d of dois) {
      // Only resolvable doi.org URLs take part in the vote.
      if (!d.startsWith("https://doi.org/") && !d.startsWith("http://doi.org/"))
        continue;
      counts.set(d, (counts.get(d) ?? 0) + 1);
    }
  }

  const zenodoArtefact = /10\.5281\/zenodo/;
  let best: { doi: string; count: number; artefact: boolean } | undefined;
  // Map iteration is insertion-ordered, and only a strictly better candidate
  // replaces the current best, so ties resolve to the first DOI seen.
  for (const [doi, count] of counts) {
    const artefact = zenodoArtefact.test(doi);
    const better =
      best === undefined ||
      (best.artefact && !artefact) ||
      (best.artefact === artefact && count > best.count);
    if (better) best = { doi, count, artefact };
  }
  return best?.doi ?? "";
}

/**
 * For a chain-step node (typically an Outcome), pick the most plausible
 * related node of the given kind.
 *
 * This is the conservative fallback used when explicit predicate linkage
 * failed: it only answers when the choice is unambiguous.
 *
 * @param _from The node the relation starts from (currently not consulted).
 * @param candidates All nodes of the wanted kind in the constellation.
 * @returns The sole candidate when there is exactly one, else `undefined`.
 */
function findRelated(
  _from: ConstellationNode,
  candidates: ConstellationNode[],
): ConstellationNode | undefined {
  return candidates.length === 1 ? candidates[0] : undefined;
}

/**
 * Find the AIDA node that a Claim refers to via its `aidaStatement` URI.
 *
 * A candidate matches when either
 *  - its nanopub URI contains the `aidaStatement` string, or
 *  - its decoded `aida.sentence` equals the URL-decoded path of the
 *    `aidaStatement` URI (the part after `http(s)://purl.org/aida/`).
 * The first matching candidate in `aidas` order wins.
 *
 * @param claim The Claim node, if one was resolved for the chain.
 * @param aidas All AIDA nodes in the constellation.
 * @returns The matching AIDA node, or `undefined` when the claim carries no
 *          `aidaStatement` or nothing matches.
 */
function findAidaForClaim(
  claim: ConstellationNode | undefined,
  aidas: ConstellationNode[],
): ConstellationNode | undefined {
  if (!claim?.claim?.aidaStatement) return undefined;
  const target = claim.claim.aidaStatement;

  // Decode once up front: the result does not depend on the candidate, and a
  // malformed percent-escape would otherwise throw again for every AIDA.
  let decoded: string | undefined;
  try {
    decoded = decodeURIComponent(
      target.replace(/^https?:\/\/purl\.org\/aida\//, ""),
    );
  } catch {
    // Malformed escape sequence: sentence matching is impossible, but the
    // URI-containment check below can still succeed.
    decoded = undefined;
  }

  for (const a of aidas) {
    if (a.uri.includes(target)) return a;
    if (decoded !== undefined && a.aida?.sentence && a.aida.sentence === decoded)
      return a;
  }
  return undefined;
}

/**
 * Pick the Quote node belonging to an AIDA.
 *
 * There is no explicit AIDA→Quote predicate available here and the raw TriG
 * is not in scope, so the only safe answer is "the single Quote in the
 * constellation". The result is therefore the same whether or not an AIDA
 * was resolved.
 *
 * @param _aida The AIDA node for the chain, if any (currently not consulted).
 * @param quotes All Quote nodes in the constellation.
 * @returns The sole Quote when there is exactly one, else `undefined`.
 */
function findQuoteForAida(
  _aida: ConstellationNode | undefined,
  quotes: ConstellationNode[],
): ConstellationNode | undefined {
  return quotes.length === 1 ? quotes[0] : undefined;
}

/**
 * Run an async worker over `items` with at most `limit` calls in flight.
 *
 * Result order matches input order. Positions where the worker resolved to
 * `null` or `undefined` hold `null`. At least one runner is always started,
 * so a `limit` of zero, a negative number or `NaN` degrades to sequential
 * execution instead of silently skipping every item.
 *
 * If a worker rejects, the returned promise rejects with that error;
 * runners already started are not cancelled.
 *
 * @param items Inputs to process.
 * @param limit Maximum number of concurrent worker invocations.
 * @param worker Async function applied to each item.
 * @returns One slot per input item, in input order.
 */
async function runWithConcurrency<T, R>(
  items: T[],
  limit: number,
  worker: (item: T) => Promise<R | null | undefined>,
): Promise<(R | null)[]> {
  const results: (R | null)[] = new Array<R | null>(items.length).fill(null);
  // Math.max/Math.min propagate NaN, which would make the loop condition
  // below false on the first check and start no runner at all.
  const requested = Number.isNaN(limit) ? 1 : limit;
  const runnerCount = Math.max(1, Math.min(requested, items.length));

  let next = 0;
  const runners: Promise<void>[] = [];
  for (let i = 0; i < runnerCount; i++) {
    runners.push(
      (async () => {
        while (true) {
          // Claiming the index is synchronous, so runners never share one.
          const idx = next++;
          if (idx >= items.length) return;
          results[idx] = (await worker(items[idx])) ?? null;
        }
      })(),
    );
  }
  await Promise.all(runners);
  return results;
}
Wraps the + * sub-app in a parent app that sets `c.var.user`, mirroring what the + * better-auth middleware in `api/src/index.ts` does in production. Mocks + * `buildConstellation` so we exercise the HTTP layer in isolation from the + * actual KP traversal. + */ +import { Hono } from "hono"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +const buildConstellationMock = vi.fn(); +vi.mock("./constellation", () => ({ + buildConstellation: (...args: unknown[]) => buildConstellationMock(...args), +})); + +// Import the sub-app AFTER the mock so its module-load picks up the stub. +const { default: npApp } = await import("./index"); + +type FakeUser = { id: string } | null; + +type ParentVars = { + Variables: { + user: FakeUser; + session: unknown | null; + }; +}; + +/** Build a parent Hono app that injects `user` and mounts /np like prod does. */ +function mountWithUser(user: FakeUser) { + const parent = new Hono(); + parent.use("*", async (c, next) => { + c.set("user", user); + await next(); + }); + parent.route("/np", npApp); + return parent; +} + +const CONSTELLATION_PAYLOAD = { + entry: "https://w3id.org/sciencelive/np/RAabc", + nodeCount: 3, + edgeCount: 2, + sparqlEndpoint: "https://query.knowledgepixels.com/repo/full", + nodes: [], + edges: [], + externalCitations: [], +}; + +describe("GET /np/constellation", () => { + beforeEach(() => { + buildConstellationMock.mockReset(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it("returns 401 when no user is signed in", async () => { + const app = mountWithUser(null); + const res = await app.request( + "/np/constellation?uri=https://w3id.org/sciencelive/np/RAabcdefghijklmnopqrstuvwxyz0000000", + ); + expect(res.status).toBe(401); + const body = (await res.json()) as { error: string }; + expect(body.error).toMatch(/Unauthorized/); + expect(buildConstellationMock).not.toHaveBeenCalled(); + }); + + it("returns 400 when the uri query parameter is missing", async () => { + const 
app = mountWithUser({ id: "u1" }); + const res = await app.request("/np/constellation"); + expect(res.status).toBe(400); + const body = (await res.json()) as { error: string }; + expect(body.error).toMatch(/Missing 'uri'/); + expect(buildConstellationMock).not.toHaveBeenCalled(); + }); + + it("returns 400 when the uri is not a nanopub URI", async () => { + const app = mountWithUser({ id: "u1" }); + const res = await app.request( + "/np/constellation?uri=https://example.com/not-a-nanopub", + ); + expect(res.status).toBe(400); + const body = (await res.json()) as { error: string }; + expect(body.error).toMatch(/does not look like a nanopub URI/); + expect(buildConstellationMock).not.toHaveBeenCalled(); + }); + + it("returns 200 with the constellation JSON when uri is valid", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + const res = await app.request( + "/np/constellation?uri=https://w3id.org/sciencelive/np/RAabcdefghijklmnopqrstuvwxyz0000000", + ); + expect(res.status).toBe(200); + const body = await res.json(); + expect(body).toEqual(CONSTELLATION_PAYLOAD); + expect(buildConstellationMock).toHaveBeenCalledTimes(1); + }); + + it("strips fragments from the uri before passing to buildConstellation", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + await app.request( + "/np/constellation?uri=https://w3id.org/sciencelive/np/RAabcdefghijklmnopqrstuvwxyz0000000%23assertion", + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + "https://w3id.org/sciencelive/np/RAabcdefghijklmnopqrstuvwxyz0000000", + expect.objectContaining({ depthLimit: 5, maxNodes: 80 }), + ); + }); + + it("clamps depth and maxNodes to their valid ranges", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + await app.request( + 
"/np/constellation?uri=https://w3id.org/sciencelive/np/RAabcdefghijklmnopqrstuvwxyz0000000&depth=999&maxNodes=999", + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ depthLimit: 10, maxNodes: 200 }), + ); + }); + + it("falls back to defaults for non-numeric depth/maxNodes", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + await app.request( + "/np/constellation?uri=https://w3id.org/sciencelive/np/RAabcdefghijklmnopqrstuvwxyz0000000&depth=banana&maxNodes=", + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ depthLimit: 5, maxNodes: 80 }), + ); + }); + + it("returns 502 when buildConstellation throws", async () => { + buildConstellationMock.mockRejectedValueOnce( + new Error("KP unreachable after retries"), + ); + const app = mountWithUser({ id: "u1" }); + const res = await app.request( + "/np/constellation?uri=https://w3id.org/sciencelive/np/RAabcdefghijklmnopqrstuvwxyz0000000", + ); + expect(res.status).toBe(502); + const body = (await res.json()) as { error: string }; + expect(body.error).toMatch(/KP unreachable/); + }); +}); + +// ============================================================================= +// BOUNDARY + EDGE CASES — round 2 +// ============================================================================= + +const VALID_URI = + "https://w3id.org/sciencelive/np/RAabcdefghijklmnopqrstuvwxyz0000000"; + +describe("non-GET methods on /np/constellation", () => { + beforeEach(() => buildConstellationMock.mockReset()); + afterEach(() => vi.restoreAllMocks()); + + it("returns 404 for POST", async () => { + const app = mountWithUser({ id: "u1" }); + const res = await app.request("/np/constellation?uri=" + VALID_URI, { + method: "POST", + }); + // The /constellation route only declares GET — Hono responds 404. 
+ expect(res.status).toBe(404); + expect(buildConstellationMock).not.toHaveBeenCalled(); + }); + + it("returns 404 for PUT", async () => { + const app = mountWithUser({ id: "u1" }); + const res = await app.request("/np/constellation?uri=" + VALID_URI, { + method: "PUT", + }); + expect(res.status).toBe(404); + }); + + it("returns 404 for DELETE", async () => { + const app = mountWithUser({ id: "u1" }); + const res = await app.request("/np/constellation?uri=" + VALID_URI, { + method: "DELETE", + }); + expect(res.status).toBe(404); + }); +}); + +describe("depth and maxNodes boundary clamping", () => { + beforeEach(() => buildConstellationMock.mockReset()); + afterEach(() => vi.restoreAllMocks()); + + it("clamps depth=0 to 1 (lower bound)", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + await app.request( + `/np/constellation?uri=${VALID_URI}&depth=0`, + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ depthLimit: 1 }), + ); + }); + + it("clamps depth=-5 to 1 (negative lower bound)", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + await app.request( + `/np/constellation?uri=${VALID_URI}&depth=-5`, + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ depthLimit: 1 }), + ); + }); + + it("clamps maxNodes=0 to 1 (lower bound)", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + await app.request( + `/np/constellation?uri=${VALID_URI}&maxNodes=0`, + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ maxNodes: 1 }), + ); + }); + + it("accepts depth at the exact lower bound (1)", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + 
const app = mountWithUser({ id: "u1" }); + await app.request( + `/np/constellation?uri=${VALID_URI}&depth=1&maxNodes=1`, + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ depthLimit: 1, maxNodes: 1 }), + ); + }); + + it("accepts depth at the exact upper bound (10)", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + await app.request( + `/np/constellation?uri=${VALID_URI}&depth=10&maxNodes=200`, + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ depthLimit: 10, maxNodes: 200 }), + ); + }); + + it("ignores trailing non-numeric chars in depth (parseInt picks the prefix)", async () => { + // Number.parseInt("3abc", 10) === 3 — this is documented JS behaviour. + // Lock it in so a future regex tightening doesn't break the contract. + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + await app.request( + `/np/constellation?uri=${VALID_URI}&depth=3abc`, + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ depthLimit: 3 }), + ); + }); +}); + +describe("/np route coverage", () => { + beforeEach(() => buildConstellationMock.mockReset()); + afterEach(() => vi.restoreAllMocks()); + + it("returns 404 for unknown paths under /np", async () => { + const app = mountWithUser({ id: "u1" }); + const res = await app.request("/np/does-not-exist"); + expect(res.status).toBe(404); + }); + + it("returns 404 for /np with no trailing path", async () => { + const app = mountWithUser({ id: "u1" }); + const res = await app.request("/np"); + expect(res.status).toBe(404); + }); + + it("returns application/json content-type on success", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + const res = await 
app.request(`/np/constellation?uri=${VALID_URI}`); + expect(res.headers.get("content-type")).toMatch(/application\/json/); + }); + + it("returns application/json content-type on error", async () => { + buildConstellationMock.mockRejectedValueOnce(new Error("boom")); + const app = mountWithUser({ id: "u1" }); + const res = await app.request(`/np/constellation?uri=${VALID_URI}`); + expect(res.headers.get("content-type")).toMatch(/application\/json/); + }); + + it("falls back to a generic message when buildConstellation throws non-Error", async () => { + buildConstellationMock.mockRejectedValueOnce("plain string thrown"); + const app = mountWithUser({ id: "u1" }); + const res = await app.request(`/np/constellation?uri=${VALID_URI}`); + expect(res.status).toBe(502); + const body = (await res.json()) as { error: string }; + expect(body.error).toMatch(/Constellation build failed/); + }); +}); + +// ============================================================================= +// ROUND 3 — protocol-level gaps +// ============================================================================= + +describe("/np/constellation protocol gaps", () => { + beforeEach(() => buildConstellationMock.mockReset()); + afterEach(() => vi.restoreAllMocks()); + + it("HEAD requests pass through Hono — return same response shape as GET (no body)", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + const res = await app.request( + `/np/constellation?uri=${VALID_URI}`, + { method: "HEAD" }, + ); + // Hono routes only respond to GET (we declared `app.get()`). HEAD is + // typically auto-handled by Hono via the matching GET route, but our + // route's `get()` only matches GET — HEAD returns 404 unless explicitly + // declared. PIN current behaviour. 
+ expect([200, 404]).toContain(res.status); + }); + + it("URI at exact 20-char hash boundary passes through the HTTP route", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + const minLengthUri = "https://w3id.org/np/RA" + "X".repeat(20); + const res = await app.request( + `/np/constellation?uri=${encodeURIComponent(minLengthUri)}`, + ); + expect(res.status).toBe(200); + expect(buildConstellationMock).toHaveBeenCalledWith( + minLengthUri, + expect.any(Object), + ); + }); + + it("URI one char short of the hash boundary returns 400", async () => { + const app = mountWithUser({ id: "u1" }); + const tooShort = "https://w3id.org/np/RA" + "X".repeat(19); + const res = await app.request( + `/np/constellation?uri=${encodeURIComponent(tooShort)}`, + ); + expect(res.status).toBe(400); + expect(buildConstellationMock).not.toHaveBeenCalled(); + }); + + it("multiple `uri` query params — Hono picks one deterministically", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + const a = "https://w3id.org/sciencelive/np/RAaaa00000000000000000000000000000000000000"; + const b = "https://w3id.org/sciencelive/np/RAbbb00000000000000000000000000000000000000"; + await app.request( + `/np/constellation?uri=${encodeURIComponent(a)}&uri=${encodeURIComponent(b)}`, + ); + expect(buildConstellationMock).toHaveBeenCalledTimes(1); + const calledWith = buildConstellationMock.mock.calls[0][0]; + // Hono returns the FIRST value for repeated query params; lock it in. + expect([a, b]).toContain(calledWith); + }); + + it("serialises a large structured response correctly through c.json()", async () => { + // Synthesize a payload roughly the size of a real 19-node Bombus chain + // and verify it round-trips through the HTTP response unchanged. 
+ const bigPayload = { + ...CONSTELLATION_PAYLOAD, + nodeCount: 19, + nodes: Array.from({ length: 19 }, (_, i) => ({ + uri: `https://w3id.org/sciencelive/np/RAnode${i}${"0".repeat(35)}`, + stepKind: "outcome", + stepType: "Declaring a replication study outcome according to FORRT", + templateUri: "", + label: `Outcome ${i}`, + date: "2026-05-11T00:00:00Z", + creators: ["https://orcid.org/0000-0002-1784-2920"], + authorsOrcid: [], + plainTextExcerpts: ["A".repeat(2000)], + githubUrls: [], + })), + chains: [ + { + id: "chain-0", + outcomeUri: "u", + outcomeVerdict: "Validated", + outcomeConfidence: "HighConfidence", + citoRelations: ["confirms"], + steps: Array.from({ length: 7 }, (_, i) => ({ + step: "Outcome", + uri: `u-${i}`, + text: "long text ".repeat(100), + })), + }, + ], + }; + buildConstellationMock.mockResolvedValueOnce(bigPayload); + const app = mountWithUser({ id: "u1" }); + const res = await app.request(`/np/constellation?uri=${VALID_URI}`); + expect(res.status).toBe(200); + const body = await res.json(); + expect(body).toEqual(bigPayload); + }); + + it("URL-encoded URI with %2F path separators decodes properly through Hono", async () => { + buildConstellationMock.mockResolvedValueOnce(CONSTELLATION_PAYLOAD); + const app = mountWithUser({ id: "u1" }); + // %2F is the URL-encoded form of /. A buggy URL parser might double- + // decode and lose path segments. 
+ await app.request( + `/np/constellation?uri=https%3A%2F%2Fw3id.org%2Fsciencelive%2Fnp%2FRAabcdefghijklmnopqrstuvwxyz0000000`, + ); + expect(buildConstellationMock).toHaveBeenCalledWith( + "https://w3id.org/sciencelive/np/RAabcdefghijklmnopqrstuvwxyz0000000", + expect.any(Object), + ); + }); +}); diff --git a/api/src/np/index.ts b/api/src/np/index.ts new file mode 100644 index 0000000..46fe850 --- /dev/null +++ b/api/src/np/index.ts @@ -0,0 +1,74 @@ +import { Session, User } from "better-auth"; +import { Hono } from "hono"; +import { buildConstellation } from "./constellation"; +import { canonicalNanopubUri } from "./trig"; + +const app = new Hono<{ + Bindings: Env; + Variables: { + user: User | null; + session: Session | null; + }; +}>(); + +/** + * GET /np/constellation?uri=&depth=&maxNodes= + * + * Bidirectional FORRT-chain traversal from the given nanopub URI. Returns + * the full reachable constellation as flat `nodes` + `edges` arrays plus + * external (non-nanopub) citation URIs (typically DOIs). + * + * Auth: signed-in users only for v1 (sits behind the better-auth session + * middleware in api/src/index.ts). API-key auth + paid-tier gating arrive + * in Week 3 of docs/plans/nanopub-query-api.md. + * + * NOTE: the plan specifies `GET /api/np/{uri}/constellation` (path param). + * Encoded URIs in path params are fragile, so v1 uses a query param. The + * path-param form can be added as an alias later. 
/**
 * Parse an optional query-string value as a base-10 integer and force it
 * into the inclusive range [`min`, `max`].
 *
 * Parsing uses `Number.parseInt`, so a numeric prefix is accepted
 * (`"3abc"` → 3). A missing, empty or non-numeric value yields `fallback`.
 *
 * @param raw Raw query-string value, if present.
 * @param min Lower bound of the accepted range.
 * @param max Upper bound of the accepted range.
 * @param fallback Value returned when `raw` cannot be used.
 * @returns The clamped integer, or `fallback`.
 */
function clampInt(
  raw: string | undefined,
  min: number,
  max: number,
  fallback: number,
): number {
  if (raw === undefined || raw === "") {
    return fallback;
  }
  const parsed = Number.parseInt(raw, 10);
  if (!Number.isFinite(parsed)) {
    return fallback;
  }
  const cappedAbove = Math.min(max, parsed);
  return Math.max(min, cappedAbove);
}
+ */ +export const REFERENCES_TO = ` +prefix rdfs: +prefix np: +prefix npa: +prefix npx: +prefix dct: +prefix nt: + +select ?np ?label ?date ?creator ?template where { + graph npa:networkGraph { + ?np npa:refersToNanopub ?_nanopubUri . + } + graph npa:graph { + ?np npa:hasValidSignatureForPublicKeyHash ?pubkey . + filter not exists { ?npx npx:invalidates ?np ; npa:hasValidSignatureForPublicKeyHash ?pubkey . } + filter not exists { ?np npx:invalidates ?_nanopubUri . } + optional { ?np rdfs:label ?label . } + ?np np:hasAssertion ?assertion ; + dct:created ?date ; + dct:creator ?creator . + } + filter not exists { graph ?assertion { ?_nanopubUri rdfs:comment ?_s . } } + filter not exists { graph ?assertion { ?approver npx:approvesOf ?_nanopubUri . } } + filter not exists { graph ?assertion { ?disapprover npx:disapprovesOf ?_nanopubUri . } } + optional { graph npa:networkGraph { ?np nt:wasCreatedFromTemplate ?template . } } +} +order by desc(?date) +limit 100 +`; + +/** + * Source: forrt-replication-template/scripts/queries/references-from.rq + * Returns nanopubs that ?_nanopubUri REFERS TO (downstream / outgoing edges). + */ +export const REFERENCES_FROM = ` +prefix rdfs: +prefix np: +prefix npa: +prefix npx: +prefix dct: +prefix nt: + +select ?np ?label ?date ?creator ?template where { + graph npa:networkGraph { + ?_nanopubUri npa:refersToNanopub ?np . + } + graph npa:graph { + ?np npa:hasValidSignatureForPublicKeyHash ?pubkey . + filter not exists { ?npx npx:invalidates ?np ; npa:hasValidSignatureForPublicKeyHash ?pubkey . } + filter not exists { ?np npx:invalidates ?_nanopubUri . } + optional { ?np rdfs:label ?label . } + ?np np:hasAssertion ?assertion ; + dct:created ?date ; + dct:creator ?creator . + } + filter not exists { graph ?assertion { ?_nanopubUri rdfs:comment ?_s . } } + filter not exists { graph ?assertion { ?approver npx:approvesOf ?_nanopubUri . } } + filter not exists { graph ?assertion { ?disapprover npx:disapprovesOf ?_nanopubUri . 
/**
 * Substitute every `?_nanopubUri` placeholder in `query` with a bracketed
 * IRI reference built from `nanopubUri`.
 *
 * Two safeguards apply:
 *  - Characters that the SPARQL grammar forbids inside `<…>` (control
 *    characters and space, `<`, `>`, `"`, `{`, `}`, `|`, `^`, `` ` ``, `\`)
 *    are percent-encoded, so the value cannot close the IRI early and append
 *    query text.
 *  - The substitution is literal: `$&`, `$1`, `$$` and similar sequences in
 *    the URI are inserted verbatim rather than treated as replacement
 *    patterns.
 *
 * @param query SPARQL text containing `?_nanopubUri` placeholders.
 * @param nanopubUri URI to bind.
 * @returns The query with every placeholder replaced by `<uri>`.
 */
export function bindUri(query: string, nanopubUri: string): string {
  const safeIri = nanopubUri.replace(
    /[\u0000-\u0020<>"{}|^`\\]/g,
    (ch) => "%" + ch.charCodeAt(0).toString(16).toUpperCase().padStart(2, "0"),
  );
  // split/join performs a plain-text substitution with no special handling
  // of `$` in the replacement.
  return query.split("?_nanopubUri").join(`<${safeIri}>`);
}
const rows = await executeSparql("SELECT ?np WHERE { ?np a ?x }"); + expect(rows).toEqual([ + { + np: "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + }, + ]); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("retries on transient 503 and succeeds on next attempt", async () => { + fetchMock + .mockResolvedValueOnce(mockResponse(503, "upstream busy")) + .mockResolvedValueOnce( + mockResponse(200, JSON.stringify(SPARQL_OK_BODY)), + ); + const rows = await executeSparql("SELECT ?np WHERE { ?np a ?x }"); + expect(rows.length).toBe(1); + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it("retries 5xx up to MAX_RETRIES then throws", async () => { + fetchMock.mockResolvedValue(mockResponse(502, "bad gateway")); + await expect( + executeSparql("SELECT ?np WHERE { ?np a ?x }"), + ).rejects.toThrow(/transient 502|after retries/); + // Initial attempt + MAX_RETRIES (3) = 4 total calls + expect(fetchMock).toHaveBeenCalledTimes(4); + }); + + it("does NOT retry on a hard 4xx", async () => { + fetchMock.mockResolvedValueOnce(mockResponse(400, "malformed query")); + await expect( + executeSparql("SELECT ?np WHERE { ?np a ?x }"), + ).rejects.toThrow(/SPARQL query failed: 400/); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("propagates AbortError without retrying", async () => { + const abortError = new Error("aborted"); + abortError.name = "AbortError"; + fetchMock.mockRejectedValueOnce(abortError); + await expect( + executeSparql("SELECT ?np WHERE { ?np a ?x }"), + ).rejects.toThrow(/aborted/); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); +}); + +describe("resolverUrl", () => { + it("rewrites sciencelive/ form to bare np/ resolver", () => { + expect( + resolverUrl( + "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + ), + ).toBe( + "https://w3id.org/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + ); + }); + + it("leaves bare np/ URIs unchanged", () => { + const u = 
"https://w3id.org/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8"; + expect(resolverUrl(u)).toBe(u); + }); +}); + +describe("fetchTrig", () => { + let fetchMock: ReturnType; + + beforeEach(() => { + fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + }); + + afterEach(() => { + vi.unstubAllGlobals(); + }); + + const TRIG_OK = `@prefix this: . +sub:Head { this: a np:Nanopublication . }`; + + it("returns the body on a clean 200", async () => { + fetchMock.mockResolvedValueOnce( + mockResponse(200, TRIG_OK, "application/trig"), + ); + const body = await fetchTrig("https://w3id.org/np/RAX"); + expect(body).toBe(TRIG_OK); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("retries 5xx then succeeds", async () => { + fetchMock + .mockResolvedValueOnce(mockResponse(503, "busy")) + .mockResolvedValueOnce( + mockResponse(200, TRIG_OK, "application/trig"), + ); + const body = await fetchTrig("https://w3id.org/np/RAX"); + expect(body).toBe(TRIG_OK); + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it("throws on HTML response (URI form not supported by resolver)", async () => { + fetchMock.mockResolvedValueOnce( + mockResponse( + 200, + "nope", + "text/html", + ), + ); + await expect(fetchTrig("https://w3id.org/np/RAX")).rejects.toThrow( + /Resolver returned HTML/, + ); + }); + + it("fails immediately on a hard 404", async () => { + fetchMock.mockResolvedValueOnce(mockResponse(404, "not found")); + await expect(fetchTrig("https://w3id.org/np/RAX")).rejects.toThrow( + /TriG fetch failed/, + ); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("rejects an empty body that starts with { + fetchMock.mockResolvedValueOnce( + mockResponse(200, "...", "text/html"), + ); + await expect(fetchTrig("https://w3id.org/np/RAX")).rejects.toThrow( + /Resolver returned HTML/, + ); + }); + + it("retries 5xx until exhausted then throws", async () => { + fetchMock.mockResolvedValue(mockResponse(503, "busy")); + await 
expect(fetchTrig("https://w3id.org/np/RAX")).rejects.toThrow( + /transient 503|after retries/, + ); + expect(fetchMock).toHaveBeenCalledTimes(4); + }); + + it("propagates AbortError without retrying", async () => { + const err = new Error("aborted"); + err.name = "AbortError"; + fetchMock.mockRejectedValueOnce(err); + await expect(fetchTrig("https://w3id.org/np/RAX")).rejects.toThrow( + /aborted/, + ); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); +}); + +// ============================================================================= +// BOUNDARY + EDGE CASES — round 2 +// ============================================================================= + +describe("executeSparql edge cases", () => { + let fetchMock: ReturnType; + + beforeEach(() => { + fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + }); + + afterEach(() => { + vi.unstubAllGlobals(); + }); + + it("returns [] when the SPARQL response has no bindings", async () => { + fetchMock.mockResolvedValueOnce( + mockResponse(200, JSON.stringify({ results: { bindings: [] } })), + ); + const rows = await executeSparql("SELECT ?np WHERE { ?np a ?x }"); + expect(rows).toEqual([]); + }); + + it("returns [] when the response is missing the results key entirely", async () => { + fetchMock.mockResolvedValueOnce(mockResponse(200, JSON.stringify({}))); + const rows = await executeSparql("SELECT ?np WHERE { ?np a ?x }"); + expect(rows).toEqual([]); + }); + + it("propagates a JSON parse failure as a thrown error", async () => { + fetchMock.mockResolvedValueOnce(mockResponse(200, "this is not json")); + await expect( + executeSparql("SELECT ?np WHERE { ?np a ?x }"), + ).rejects.toThrow(); + }); + + it("retries through multiple sequential 5xxs and succeeds on the 4th attempt", async () => { + fetchMock + .mockResolvedValueOnce(mockResponse(502, "x")) + .mockResolvedValueOnce(mockResponse(503, "x")) + .mockResolvedValueOnce(mockResponse(504, "x")) + .mockResolvedValueOnce( + mockResponse(200, 
JSON.stringify(SPARQL_OK_BODY)), + ); + const rows = await executeSparql("SELECT ?np WHERE { ?np a ?x }"); + expect(rows.length).toBe(1); + expect(fetchMock).toHaveBeenCalledTimes(4); + }); + + it("retries on a fetch-level network throw (not an HTTP response)", async () => { + fetchMock + .mockRejectedValueOnce(new TypeError("fetch failed")) + .mockResolvedValueOnce( + mockResponse(200, JSON.stringify(SPARQL_OK_BODY)), + ); + const rows = await executeSparql("SELECT ?np WHERE { ?np a ?x }"); + expect(rows.length).toBe(1); + expect(fetchMock).toHaveBeenCalledTimes(2); + }); +}); + +describe("resolverUrl edge cases", () => { + it("does not double-rewrite when called twice", () => { + const sl = + "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8"; + const once = resolverUrl(sl); + const twice = resolverUrl(once); + expect(twice).toBe(once); + }); + + it("does NOT alter unrelated URIs", () => { + const other = "https://example.org/foo"; + expect(resolverUrl(other)).toBe(other); + }); +}); + +// ============================================================================= +// ROUND 3 — gap-filling failure modes +// ============================================================================= + +describe("executeSparql round-3 failure modes", () => { + let fetchMock: ReturnType; + beforeEach(() => { + fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + }); + afterEach(() => vi.unstubAllGlobals()); + + it("throws (not retries) on a 200 response with non-JSON body", async () => { + fetchMock.mockResolvedValueOnce(mockResponse(200, "nope")); + await expect( + executeSparql("SELECT ?np WHERE { ?np a ?x }"), + ).rejects.toThrow(); + // JSON.parse throws synchronously after the 200 check — no retry. 
+ expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("returns [] gracefully on a 200 response with empty bindings array", async () => { + fetchMock.mockResolvedValueOnce( + mockResponse(200, JSON.stringify({ results: { bindings: [] } })), + ); + expect(await executeSparql("SELECT ?np WHERE { ?np a ?x }")).toEqual([]); + }); + + it("returns rows with undefined value fields when bindings rows lack `value`", async () => { + // SPARQL endpoints SHOULD return `{ value: "..." }` per the JSON results + // spec but defend against partial data. + fetchMock.mockResolvedValueOnce( + mockResponse( + 200, + JSON.stringify({ + results: { + bindings: [{ np: { type: "uri" /* no value */ } }], + }, + }), + ), + ); + const rows = await executeSparql("SELECT ?np WHERE { ?np a ?x }"); + expect(Array.isArray(rows)).toBe(true); + expect(rows[0]?.np).toBeUndefined(); + }); + + it("uses the abort signal when supplied", async () => { + const controller = new AbortController(); + fetchMock.mockImplementationOnce((_url, init) => { + expect(init?.signal).toBe(controller.signal); + controller.abort(); + const err = new Error("aborted"); + err.name = "AbortError"; + throw err; + }); + await expect( + executeSparql("SELECT ?np WHERE { ?np a ?x }", controller.signal), + ).rejects.toThrow(); + }); +}); + +describe("fetchTrig round-3 failure modes", () => { + let fetchMock: ReturnType; + beforeEach(() => { + fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + }); + afterEach(() => vi.unstubAllGlobals()); + + it("detects HTML even with leading whitespace before doctype", async () => { + fetchMock.mockResolvedValueOnce( + mockResponse(200, " \n \n", "application/trig"), + ); + await expect(fetchTrig("https://w3id.org/np/RAX")).rejects.toThrow( + /Resolver returned HTML/, + ); + }); + + it("does NOT treat a TriG body that mentions later as HTML", async () => { + // A literal value like "" inside a TriG string should NOT trigger + // the HTML guard — only the FIRST 32 chars are checked. 
type SparqlBindingValue = { type: string; value: string };
type SparqlResults = {
  // Both levels are optional because the parser below tolerates a response
  // that lacks them and returns an empty row list.
  results?: { bindings?: Record<string, SparqlBindingValue>[] };
};

/** Number of retries after the initial attempt (MAX_RETRIES + 1 requests at most). */
const MAX_RETRIES = 3;
/** Delay before the first retry, in milliseconds; doubles on each further retry. */
const RETRY_BASE_MS = 600;

/** True when `e` is an Error whose name marks it as an aborted request. */
function isAbortError(e: unknown): boolean {
  return e instanceof Error && e.name === "AbortError";
}

/** Value to reject with when `signal` has aborted: its reason, or an AbortError. */
function abortReason(signal: AbortSignal): unknown {
  if (signal.reason !== undefined) return signal.reason;
  const err = new Error("The operation was aborted");
  err.name = "AbortError";
  return err;
}

/** Wait `ms` milliseconds before a retry, rejecting early if `signal` aborts. */
function sleepUnlessAborted(ms: number, signal?: AbortSignal): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    if (signal?.aborted) {
      reject(abortReason(signal));
      return;
    }
    const onAbort = (): void => {
      clearTimeout(timer);
      reject(signal ? abortReason(signal) : new Error("aborted"));
    };
    const timer = setTimeout(() => {
      signal?.removeEventListener("abort", onAbort);
      resolve();
    }, ms);
    signal?.addEventListener("abort", onAbort, { once: true });
  });
}

/** Release an unread response body so its connection is not held during backoff. */
async function discardBody(res: Response): Promise<void> {
  try {
    await res.body?.cancel();
  } catch {
    // Best effort: a body that is already closed or locked needs no cleanup.
  }
}

/**
 * POST a SPARQL query to the KnowledgePixels endpoint and return rows as
 * `{ var: stringValue }` objects.
 *
 * Transient failures (HTTP 5xx, network-level rejections) are retried up to
 * MAX_RETRIES times with exponential backoff — KP's nginx returns
 * intermittent 503s under concurrent load. Everything else fails fast.
 *
 * NOTE(review): the generic arguments of the return type were not visible in
 * the reviewed text; `Record<string, string>` matches what the body builds —
 * confirm against the original declaration.
 *
 * @param query SPARQL query text, sent form-encoded as the `query` field.
 * @param signal Optional abort signal. Once it has aborted, no further attempt
 *   is made and a pending backoff wait ends immediately.
 * @returns One object per result binding, mapping variable name to the
 *   binding's `value`; `[]` when the response carries no bindings.
 * @throws The abort/timeout rejection when the request was aborted; an Error
 *   starting `SPARQL query failed:` on a non-5xx error status; the last
 *   transient error once retries are exhausted; whatever `res.json()` throws
 *   on an unparseable 200 body.
 */
export async function executeSparql(
  query: string,
  signal?: AbortSignal,
): Promise<Record<string, string>[]> {
  let lastError: unknown = null;
  for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
    if (attempt > 0) {
      await sleepUnlessAborted(RETRY_BASE_MS * 2 ** (attempt - 1), signal);
    }
    // Only the network call is in try/catch — body validation throws
    // non-retryable errors that must bubble out.
    let res: Response;
    try {
      res = await fetch(NANOPUB_SPARQL_ENDPOINT_FULL, {
        method: "POST",
        body: new URLSearchParams({ query }),
        headers: {
          Accept: "application/sparql-results+json",
          "Content-Type": "application/x-www-form-urlencoded",
          "User-Agent": "science-live-platform-api/np-constellation",
        },
        signal,
      });
    } catch (e) {
      // An aborted signal may reject with something other than AbortError
      // (e.g. a TimeoutError, or a custom abort reason). Retrying would only
      // sleep through the backoff schedule and fail again.
      if (isAbortError(e) || signal?.aborted) throw e;
      lastError = e;
      continue;
    }
    if (res.status >= 500 && res.status < 600) {
      lastError = new Error(
        `SPARQL transient ${res.status} ${res.statusText}`,
      );
      await discardBody(res);
      continue;
    }
    if (!res.ok) {
      const detail = await res.text().catch(() => res.statusText);
      throw new Error(
        `SPARQL query failed: ${res.status} ${res.statusText}: ${detail.slice(0, 200)}`,
      );
    }
    const data = (await res.json()) as SparqlResults;
    return (data.results?.bindings ?? []).map((row) => {
      const out: Record<string, string> = {};
      for (const [k, v] of Object.entries(row)) out[k] = v.value;
      return out;
    });
  }
  throw lastError instanceof Error
    ? lastError
    : new Error("SPARQL query failed after retries");
}

/**
 * Map a nanopub URI to the URL that should be fetched for its TriG.
 *
 * The W3ID resolver serves TriG for `https://w3id.org/np/RA…` URIs.
 * Science Live URIs of the form `https://w3id.org/sciencelive/np/RA…`
 * redirect to the platform's HTML viewer, so the prefix is swapped before
 * fetching. Only a leading prefix is rewritten; a URI that merely contains
 * the Science Live prefix further along is returned unchanged.
 *
 * @param uri Nanopub URI in either form, or any other URI.
 * @returns The bare `https://w3id.org/np/…` form for Science Live URIs,
 *   otherwise `uri` itself.
 */
export function resolverUrl(uri: string): string {
  const scienceLivePrefix = "https://w3id.org/sciencelive/np/";
  return uri.startsWith(scienceLivePrefix)
    ? `https://w3id.org/np/${uri.slice(scienceLivePrefix.length)}`
    : uri;
}
+ */ +export async function fetchTrig( + uri: string, + signal?: AbortSignal, +): Promise { + let lastError: unknown = null; + for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) { + if (attempt > 0) { + const wait = RETRY_BASE_MS * 2 ** (attempt - 1); + await new Promise((resolve) => setTimeout(resolve, wait)); + } + let res: Response; + try { + res = await fetch(resolverUrl(uri), { + headers: { + Accept: "application/trig, application/n-quads;q=0.9, */*;q=0.5", + "User-Agent": "science-live-platform-api/np-constellation", + }, + signal, + }); + } catch (e) { + if (e instanceof Error && e.name === "AbortError") throw e; + lastError = e; + continue; + } + if (res.status >= 500 && res.status < 600) { + lastError = new Error( + `TriG fetch transient ${res.status} ${res.statusText}`, + ); + continue; + } + if (!res.ok) { + throw new Error( + `TriG fetch failed for ${uri}: ${res.status} ${res.statusText}`, + ); + } + const body = await res.text(); + const stripped = body.trimStart().slice(0, 32).toLowerCase(); + if ( + stripped.startsWith(" { + it("matches the SL-prefixed nanopub URI form", () => { + expect( + canonicalNanopubUri( + "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + ), + ).toBe( + "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + ); + }); + + it("matches the generic nanopub URI form", () => { + expect( + canonicalNanopubUri( + "https://w3id.org/np/RA43F9EoOuzF0xoNUnCMNyFsfIqlsuWDdPHCnN0wCdCAw", + ), + ).toBe("https://w3id.org/np/RA43F9EoOuzF0xoNUnCMNyFsfIqlsuWDdPHCnN0wCdCAw"); + }); + + it("strips fragments and named-graph suffixes", () => { + expect( + canonicalNanopubUri( + "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8#assertion", + ), + ).toBe( + "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + ); + }); + + it("returns null for non-nanopub URLs", () => { + 
expect(canonicalNanopubUri("https://example.com/foo")).toBeNull(); + expect(canonicalNanopubUri("https://doi.org/10.1234/x")).toBeNull(); + }); +}); + +describe("extractTemplateUri", () => { + it("finds wasCreatedFromTemplate via the prefixed form", () => { + const trig = ` + @prefix nt: . + this: nt:wasCreatedFromTemplate . + `; + expect(extractTemplateUri(trig)).toBe( + "https://w3id.org/np/RA2zljn0Nw9SadppOyxZoh-_Rxosslrq-vYG-p9SttnJE", + ); + }); + + it("finds wasCreatedFromTemplate via the full-IRI form", () => { + const trig = ` + this: . + `; + expect(extractTemplateUri(trig)).toBe("https://w3id.org/np/RAX"); + }); + + it("returns null when the predicate is absent", () => { + expect(extractTemplateUri("@prefix : <#> .")).toBeNull(); + }); +}); + +describe("extractTemplateLabel", () => { + it("picks the AssertionTemplate-block label, not earlier vocab labels", () => { + // Regression for the "is a" bug — the first rdfs:label in a FORRT + // template TriG belongs to rdf:type ("is a"), NOT the template itself. + const trig = ` + sub:assertion { + rdf:type rdfs:label "is a" . + rdfs:label "Creative work" . + + sub:assertion a nt:AssertionTemplate; + dct:description "Such a nanopublication declares an outcome."; + rdfs:label "Declaring a replication study outcome according to FORRT"; + nt:hasNanopubLabelPattern "Outcome: \${label}"; + nt:hasStatement sub:st01 . + + sub:outcome rdfs:label "describe the conclusion" . + } + `; + expect(extractTemplateLabel(trig)).toBe( + "Declaring a replication study outcome according to FORRT", + ); + }); + + it("falls back to the 'Template:' pubinfo self-label", () => { + // Some older templates label themselves on `this:` in pubinfo with a + // "Template: " prefix. The prefix is stripped from the returned value. + const trig = ` + sub:pubinfo { + this: dct:created "2024-12-29T16:06:43.753Z"^^xsd:dateTime; + rdfs:label "Template: Declare citations with CiTO" . 
+ } + `; + expect(extractTemplateLabel(trig)).toBe("Declare citations with CiTO"); + }); + + it("falls back to the first non-trivial label", () => { + const trig = ` + rdfs:label "label-with-content" . + `; + expect(extractTemplateLabel(trig)).toBe("label-with-content"); + }); + + it("skips trivial labels (< 8 chars) in the fallback path", () => { + // Without an AssertionTemplate block, the fallback path returns the + // first label of length >= 8. Trivial vocab labels like "is a" are + // skipped; the next eligible label wins. (For full disambiguation we'd + // need an RDF parser to filter by triple subject.) + const trig = ` + rdf:type rdfs:label "is a" . + rdfs:label rdfs:label "has the label" . + `; + expect(extractTemplateLabel(trig)).toBe("has the label"); + }); + + it("returns empty string when no labels at all", () => { + expect(extractTemplateLabel("@prefix : <#> .")).toBe(""); + }); +}); + +describe("extractOrcids", () => { + it("returns unique ORCID URIs", () => { + const trig = ` + orcid:0000-0002-1784-2920 a foaf:Person . + dc:creator , . + `; + expect(extractOrcids(trig).sort()).toEqual([ + "https://orcid.org/0000-0001-7542-0286", + "https://orcid.org/0000-0002-1784-2920", + ]); + }); + + it("returns [] when no ORCIDs present", () => { + expect(extractOrcids("@prefix : <#> .")).toEqual([]); + }); +}); + +describe("extractDois", () => { + it("returns unique DOI URIs across the TriG body", () => { + const trig = ` + a schema:CreativeWork . + dct:isPartOf . + `; + expect(extractDois(trig).sort()).toEqual([ + "https://doi.org/10.1126/science.aax8591", + "https://doi.org/10.5281/zenodo.20113777", + ]); + }); +}); + +describe("extractNanopubUris", () => { + it("returns every canonical nanopub URI mentioned in the TriG", () => { + // Regression for the 14/19 → 19/19 gap — KP's networkGraph doesn't + // materialise the Outcome→Claim edge, but the URI is right there in + // the TriG body. The mining step must surface it. 
+ const trig = ` + a forrt:Outcome; + forrt:targetsClaim ; + nt:wasCreatedFromTemplate . + `; + expect(extractNanopubUris(trig).sort()).toEqual([ + "https://w3id.org/np/RA2zljn0Nw9SadppOyxZoh-_Rxosslrq-vYG-p9SttnJE", + "https://w3id.org/sciencelive/np/RAD19jydIHgfVpRQiA8mqvVUefOd7FFwA4tLIfkXmOJmc", + "https://w3id.org/sciencelive/np/RAVfoa34PLT_3LhfcWLBZ9BQHs43euvrwaTyO9mgk-QcQ", + ]); + }); + + it("returns [] for a TriG with no nanopub URIs", () => { + expect(extractNanopubUris(" a foaf:Person .")).toEqual( + [], + ); + }); +}); + +describe("extractExcerpts", () => { + it("returns the longest unique plain-text literals, top 4 by length", () => { + const trig = ` + sub:x rdfs:label "short" . + sub:y rdfs:label "this is a substantive Outcome conclusion sentence" . + sub:z rdfs:label "this is a substantive Outcome conclusion sentence" . + sub:a rdfs:label "another long literal here for testing extraction" . + `; + const out = extractExcerpts(trig); + // dedup + length-sort + expect(out.length).toBeLessThanOrEqual(4); + expect(out[0]).toBe( + "this is a substantive Outcome conclusion sentence", + ); + expect(out).not.toContain("short"); + }); + + it("skips URI-looking literals", () => { + const trig = ` rdfs:seeAlso "https://example.com/some-long-url" .`; + expect(extractExcerpts(trig)).toEqual([]); + }); +}); + +describe("isTemplateDefinitionLabel", () => { + it("flags template-DEFINITION nanopubs", () => { + expect(isTemplateDefinitionLabel("defining an assertion template")).toBe( + true, + ); + expect(isTemplateDefinitionLabel("Defining a provenance template")).toBe( + true, + ); + expect(isTemplateDefinitionLabel("publishing labels for terms")).toBe(true); + }); + + it("does NOT flag actual chain-step templates", () => { + expect( + isTemplateDefinitionLabel( + "Declaring a replication study outcome according to FORRT", + ), + ).toBe(false); + expect(isTemplateDefinitionLabel("Declare citations with CiTO")).toBe( + false, + ); + 
expect(isTemplateDefinitionLabel("")).toBe(false); + }); + + it("does not flag declarative labels that start with 'Declaring'", () => { + // Regression — the heuristic uses .startsWith("defining"), so adjacent + // FORRT-style "Declaring …" labels must remain false. + expect(isTemplateDefinitionLabel("Declaring an original claim")).toBe( + false, + ); + }); +}); + +// ============================================================================= +// BOUNDARY + EDGE CASES — round 2 +// ============================================================================= + +describe("canonicalNanopubUri boundaries", () => { + it("matches at exactly 20-character hash (minimum length)", () => { + const uri = "https://w3id.org/np/RA" + "X".repeat(20); + expect(canonicalNanopubUri(uri)).toBe(uri); + }); + + it("rejects 19-character hash (one short of minimum)", () => { + const uri = "https://w3id.org/np/RA" + "X".repeat(19); + expect(canonicalNanopubUri(uri)).toBeNull(); + }); + + it("strips named-graph path suffixes like /Head, /assertion", () => { + const base = + "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8"; + expect(canonicalNanopubUri(`${base}/Head`)).toBe(base); + expect(canonicalNanopubUri(`${base}/assertion`)).toBe(base); + }); + + it("accepts http:// in addition to https://", () => { + expect( + canonicalNanopubUri( + "http://w3id.org/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + ), + ).toBe( + "http://w3id.org/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + ); + }); + + it("returns null for an empty string", () => { + expect(canonicalNanopubUri("")).toBeNull(); + }); + + it("returns null when prefix lacks the `RA` marker", () => { + expect( + canonicalNanopubUri( + "https://w3id.org/np/XX1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + ), + ).toBeNull(); + }); + + it("does NOT match if the URL lives at an unrelated host", () => { + expect( + canonicalNanopubUri( + 
"https://example.org/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8", + ), + ).toBeNull(); + }); +}); + +describe("extractOrcids boundaries", () => { + it("matches an ORCID ending in the X check digit", () => { + const trig = ` dct:creator .`; + expect(extractOrcids(trig)).toEqual([ + "https://orcid.org/0000-0001-2345-678X", + ]); + }); + + it("is case-insensitive on the protocol prefix", () => { + const trig = `HTTPS://orcid.org/0000-0002-1784-2920`; + expect(extractOrcids(trig)).toEqual(["HTTPS://orcid.org/0000-0002-1784-2920"]); + }); +}); + +describe("extractDois boundaries", () => { + it("matches DOIs with various registrant prefixes", () => { + const trig = ` + + + + + `; + const dois = extractDois(trig).sort(); + expect(dois).toContain("https://doi.org/10.1126/science.aax8591"); + expect(dois).toContain("https://doi.org/10.5281/zenodo.20113777"); + expect(dois).toContain("https://doi.org/10.6084/m9.figshare.10058340"); + expect(dois).toContain("https://doi.org/10.15468/dl.3frmsq"); + }); + + it("does NOT match a non-prefix string like just '10.1234/foo'", () => { + expect(extractDois("the doi is 10.1234/foo")).toEqual([]); + }); + + it("stops at a trailing > or whitespace", () => { + // DOIs in TriG appear inside `<...>`. Make sure we don't capture the >. + const trig = ` a schema:CreativeWork .`; + expect(extractDois(trig)).toEqual(["https://doi.org/10.1234/bar"]); + }); +}); + +describe("extractNanopubUris boundaries", () => { + it("matches at 20-character hash, rejects shorter", () => { + const ok = "https://w3id.org/np/RA" + "X".repeat(20); + const short = "https://w3id.org/np/RA" + "X".repeat(19); + const trig = `<${ok}> dct:isPartOf <${short}> .`; + expect(extractNanopubUris(trig)).toEqual([ok]); + }); + + it("does not double-count when the same URI appears multiple times", () => { + const uri = + "https://w3id.org/sciencelive/np/RA1q6c0fG2bMbiozF8Az2UpIfzAzqp8hoVEl6QIzfUpH8"; + const trig = `<${uri}>

<${uri}> ; <${uri}> .`; + expect(extractNanopubUris(trig)).toEqual([uri]); + }); + + it("returns [] for an empty TriG body", () => { + expect(extractNanopubUris("")).toEqual([]); + }); +}); + +describe("extractExcerpts boundaries", () => { + it("matches a literal of exactly 12 chars (minimum)", () => { + const trig = ` rdfs:label "exactlytwelve" .`; + // "exactlytwelve" is 13 chars — over min. Use exact 12. + const trigExact = ` rdfs:label "twelve_chars" .`; + expect(extractExcerpts(trig)).toContain("exactlytwelve"); + expect(extractExcerpts(trigExact)).toContain("twelve_chars"); + }); + + it("rejects literals of 11 chars (one short of minimum)", () => { + const trig = ` rdfs:label "eleven_char" .`; + expect(extractExcerpts(trig)).toEqual([]); + }); + + it("returns at most `top` results, longest-first", () => { + const trig = ` + rdfs:label "twelve_chars" . + rdfs:label "this is a much longer literal value" . + rdfs:label "medium length literal here" . + `; + const out = extractExcerpts(trig, 2); + expect(out).toHaveLength(2); + expect(out[0].length).toBeGreaterThanOrEqual(out[1].length); + }); + + it("returns [] for a TriG with only short literals", () => { + expect(extractExcerpts(` rdfs:label "tiny" .`)).toEqual([]); + }); +}); + +describe("extractTemplateLabel boundaries", () => { + it("handles a malformed AssertionTemplate block (no closing dot)", () => { + // The block regex requires a trailing `.`. If the TriG is malformed, + // we should gracefully fall through to the next strategy instead of + // hanging or matching garbage. Here the block has no `.` so the regex + // is greedy until end-of-string — and we either find the right label + // or fall back to the field-label heuristic. Either is acceptable; + // what matters is that we don't crash. 
+ const trig = `sub:assertion a nt:AssertionTemplate; rdfs:label "no closing dot here"`; + // Result is allowed to be either the captured label or the fallback — + // assertion is "does not throw" and returns a string. + expect(typeof extractTemplateLabel(trig)).toBe("string"); + }); + + it("unescapes escaped quotes inside the captured label", () => { + const trig = `sub:assertion a nt:AssertionTemplate; + rdfs:label "a label with an \\"embedded\\" quote here"; + nt:hasStatement sub:st01 .`; + expect(extractTemplateLabel(trig)).toBe( + `a label with an "embedded" quote here`, + ); + }); + + it("falls back when AssertionTemplate block exists but has no rdfs:label", () => { + const trig = ` + sub:assertion a nt:AssertionTemplate; + nt:hasStatement sub:st01 . + this: rdfs:label "Template: Some Fallback Label" . + `; + expect(extractTemplateLabel(trig)).toBe("Some Fallback Label"); + }); + + it("handles an entirely empty TriG body", () => { + expect(extractTemplateLabel("")).toBe(""); + }); +}); + +describe("extractTemplateUri edge cases", () => { + it("returns the FIRST template URI when multiple are present", () => { + const trig = ` + this: nt:wasCreatedFromTemplate ; + nt:wasCreatedFromTemplate . + `; + expect(extractTemplateUri(trig)).toBe( + "https://w3id.org/np/RA1stTemplate0000000000000000000000000000", + ); + }); +}); + +// ============================================================================= +// PREDICATE-SPECIFIC EXTRACTORS (Phase A — structured FORRT chains) +// ============================================================================= + +describe("extractGithubUrls", () => { + it("finds GitHub URLs and strips /tree/ + /blob/ paths", () => { + const trig = ` + . + . + . 
+ `; + const urls = extractGithubUrls(trig).sort(); + expect(urls).toContain( + "https://github.com/annefou/weatherxbiodiversity-projection", + ); + expect(urls).toContain("https://github.com/annefou/repo"); + // /tree/ and /blob/ stripping should dedupe to the same root + expect(urls.filter((u) => u === "https://github.com/annefou/repo")).toHaveLength(1); + }); + + it("returns [] when no GitHub URLs present", () => { + expect(extractGithubUrls("@prefix : <#> .")).toEqual([]); + }); + + it("strips trailing slashes", () => { + const trig = `

.`; + expect(extractGithubUrls(trig)).toEqual(["https://github.com/annefou/repo"]); + }); +}); + +describe("extractPredicateValue", () => { + it("returns a single-quoted literal object", () => { + const trig = ` "hello" .`; + expect(extractPredicateValue(trig, "http://example.org/p")).toBe("hello"); + }); + + it("returns a triple-quoted literal object (multiline content)", () => { + const trig = ` """multi +line +content""" .`; + expect(extractPredicateValue(trig, "http://example.org/p")).toBe( + "multi\nline\ncontent", + ); + }); + + it("returns a URI object (bracketed)", () => { + const trig = ` .`; + expect(extractPredicateValue(trig, "http://example.org/p")).toBe( + "http://example.org/target", + ); + }); + + it("returns null when the predicate is absent", () => { + expect( + extractPredicateValue(" .", "http://example.org/missing"), + ).toBeNull(); + }); + + it("returns the FIRST match if the predicate repeats", () => { + const trig = ` + . + . + `; + expect(extractPredicateValue(trig, "http://example.org/p")).toBe( + "http://example.org/first", + ); + }); + + it("unescapes backslash-quoted content", () => { + const trig = ` "she said \\"hi\\" loudly" .`; + expect(extractPredicateValue(trig, "http://example.org/p")).toBe( + `she said "hi" loudly`, + ); + }); +}); + +describe("extractPredicateValues", () => { + it("returns ALL matches when the predicate repeats", () => { + const trig = ` + . + . + `; + expect( + extractPredicateValues(trig, "http://purl.org/spar/cito/cites").sort(), + ).toEqual(["https://doi.org/10.1/a", "https://doi.org/10.1/b"]); + }); + + it("returns [] when the predicate is absent", () => { + expect( + extractPredicateValues(" .", "http://example.org/missing"), + ).toEqual([]); + }); +}); + +describe("extractNanopubMeta", () => { + // Models the real nanopub pubinfo block from KP where `this:` carries + // the human-readable label, dct:created datetime, and one-or-more + // dct:creator ORCIDs. 
+ it("extracts label, date, and creators from the `this:` property list", () => { + const trig = ` +sub:pubinfo { + this: dct:created "2026-05-11T19:40:25.904Z"^^xsd:dateTime; + dct:creator orcid:0000-0002-1784-2920, orcid:0000-0001-7542-0286; + dct:license ; + rdfs:label "Outcome: TEI mechanism replicates on Iberian Bombus" . +} + `; + const meta = extractNanopubMeta(trig); + expect(meta.label).toBe( + "Outcome: TEI mechanism replicates on Iberian Bombus", + ); + expect(meta.date).toBe("2026-05-11T19:40:25.904Z"); + // ORCIDs in compact `orcid:0000-…` form aren't matched by the absolute + // URI regex, but the function still finds absolute orcid.org URIs. + expect(meta.creators).toEqual([]); + }); + + it("finds dct:creator ORCIDs when given in absolute URI form", () => { + const trig = ` +sub:pubinfo { + this: dct:created "2026-05-11T19:40:25.904Z"^^xsd:dateTime; + dct:creator , ; + rdfs:label "X" . +} + `; + const meta = extractNanopubMeta(trig); + expect(meta.creators.sort()).toEqual([ + "https://orcid.org/0000-0001-7542-0286", + "https://orcid.org/0000-0002-1784-2920", + ]); + }); + + it("returns empty fields for a TriG without `this:` pubinfo", () => { + expect(extractNanopubMeta("@prefix : <#> .")).toEqual({ + label: "", + date: "", + creators: [], + }); + }); +}); + +describe("extractOutcomeFields", () => { + it("extracts every FORRT Outcome field", () => { + const trig = ` + sub:x a ; + """The mechanism is substrate-robust at fit time but qualified at projection."""; + """Spearman rho = 0.97 at n>=10 after main-effects-only fix."""; + """Three substrates only. One region."""; + ; + ; + ; + ; + "2026-05-09"^^xsd:date . 
+ `; + const fields = extractOutcomeFields(trig); + expect(fields.conclusion).toMatch(/substrate-robust at fit time/); + expect(fields.evidence).toMatch(/Spearman rho = 0.97/); + expect(fields.limitations).toMatch(/Three substrates only/); + expect(fields.validationStatus).toBe("PartiallySupported"); + expect(fields.confidenceLevel).toBe("HighConfidence"); + expect(fields.repository).toBe("https://doi.org/10.5281/zenodo.20113787"); + expect(fields.studyUri).toBe( + "https://w3id.org/sciencelive/np/RAstudy00000000000000000000000000000000000000", + ); + expect(fields.endDate).toBe("2026-05-09"); + }); + + it("returns empty strings when no predicates present", () => { + const out = extractOutcomeFields("@prefix : <#> ."); + expect(out.conclusion).toBe(""); + expect(out.validationStatus).toBe(""); + expect(out.repository).toBe(""); + }); + + it("ignores unknown validation-status values", () => { + const trig = ` .`; + expect(extractOutcomeFields(trig).validationStatus).toBe(""); + }); + + it("falls back to schema:codeRepository when hasOutcomeRepository absent", () => { + const trig = ` .`; + expect(extractOutcomeFields(trig).repository).toBe( + "https://github.com/annefou/repo", + ); + }); +}); + +describe("extractStudyFields", () => { + it("extracts scope, methodology, deviations, discipline, targetsClaim", () => { + const trig = ` + sub:x a ; + "Iberian Bombus, three substrates, SSP3-7.0"; + "GLMM with main-effects-only projection"; + "Drop interaction terms at projection time"; + ; + . 
+ `; + const fields = extractStudyFields(trig); + expect(fields.scope).toBe("Iberian Bombus, three substrates, SSP3-7.0"); + expect(fields.methodology).toBe( + "GLMM with main-effects-only projection", + ); + expect(fields.deviations).toBe( + "Drop interaction terms at projection time", + ); + expect(fields.discipline).toBe("http://www.wikidata.org/entity/Q125928"); + expect(fields.claimUri).toBe( + "https://w3id.org/sciencelive/np/RAclaim00000000000000000000000000000000000000", + ); + }); +}); + +describe("extractClaimFields", () => { + it("derives claimType from the URI suffix like `model_performance-FORRT-Claim`", () => { + const trig = ` + sub:x a ; + . + `; + const fields = extractClaimFields(trig); + expect(fields.claimType).toBe("model_performance"); + expect(fields.aidaStatement).toBe( + "http://purl.org/aida/Projected%20rankings%20vary%20with%20grid", + ); + }); + + it("supports any claim-type subclass", () => { + const trig = ` a .`; + expect(extractClaimFields(trig).claimType).toBe("statistical_significance"); + }); + + it("returns empty claimType when no FORRT-Claim subclass present", () => { + expect(extractClaimFields(" a .").claimType).toBe(""); + }); +}); + +describe("extractQuoteFields", () => { + it("extracts the verbatim quoted text + cited DOI + comment", () => { + const trig = ` + sub:x "Bumble bees are declining across continents."; + ; + "Headline claim sentence anchoring this replication." . + `; + const q = extractQuoteFields(trig); + expect(q.quotedText).toBe("Bumble bees are declining across continents."); + expect(q.citedDoi).toBe("https://doi.org/10.1126/science.aax8591"); + expect(q.comment).toBe( + "Headline claim sentence anchoring this replication.", + ); + }); +}); + +describe("extractAidaFields", () => { + it("decodes the AIDA sentence from a URI path", () => { + const trig = ` + a . 
+ `; + expect(extractAidaFields(trig).sentence).toBe( + "Projected per-species rankings vary with grid resolution.", + ); + }); + + it("returns empty string when no AIDA URI present", () => { + expect(extractAidaFields("@prefix : <#> .").sentence).toBe(""); + }); + + it("does not throw on malformed percent-encoding", () => { + const trig = ` a .`; + // Should fall back to the raw encoded form rather than throw. + expect(typeof extractAidaFields(trig).sentence).toBe("string"); + }); +}); + +describe("extractCitoFields", () => { + it("identifies every CiTO relation used and the cited targets", () => { + const trig = ` + ; + ; + . + `; + const c = extractCitoFields(trig); + expect(c.relations.sort()).toEqual(["cites", "extends", "qualifies"]); + expect(c.citedTargets).toEqual(["https://doi.org/10.1126/science.aax8591"]); + }); + + it("returns empty arrays when no CiTO predicates present", () => { + expect(extractCitoFields(" a .")).toEqual({ + relations: [], + citedTargets: [], + citingEntity: "", + }); + }); + + it("extracts the citing-entity URI (subject of cito: triples)", () => { + const trig = ` + + ; + . + `; + const c = extractCitoFields(trig); + expect(c.citingEntity).toBe( + "https://w3id.org/sciencelive/np/RAoutcome000000000000000000000000000000000", + ); + }); + + it("supports confirms and disputes", () => { + const trig = ` .`; + expect(extractCitoFields(trig).relations).toContain("confirms"); + }); +}); + +describe("extractResearchSoftwareFields", () => { + it("returns GitHub repo, Zenodo DOI, and supports targets", () => { + const trig = ` + a ; + ; + ; + . 
+ `; + const rs = extractResearchSoftwareFields(trig); + expect(rs.repository).toBe( + "https://github.com/annefou/weatherxbiodiversity-projection", + ); + expect(rs.zenodoDoi).toBe("https://doi.org/10.5281/zenodo.20113778"); + expect(rs.supportsTargets).toContain( + "https://w3id.org/sciencelive/np/RAclaim00000000000000000000000000000000000000", + ); + }); +}); + +describe("extractResearchSynthesisFields", () => { + it("returns synthesis/conditions/limitations/recommendations + supported outcomes + topic Qids", () => { + const trig = ` + sub:x a ; + """The mechanism resolves into two empirically distinct claims."""; + """Iberian peninsula, three substrates, SSP3-7.0."""; + """Three substrates only."""; + """Filter to species with at least 10 occupied cells per substrate."""; + , ; + , ; + "2026-05-10"^^xsd:date . + `; + const s = extractResearchSynthesisFields(trig); + expect(s.synthesisDescription).toMatch(/empirically distinct claims/); + expect(s.conditions).toMatch(/Iberian peninsula/); + expect(s.limitations).toMatch(/Three substrates only/); + expect(s.recommendations).toMatch(/at least 10 occupied cells/); + expect(s.supportedByOutcomeUris.sort()).toEqual([ + "https://w3id.org/sciencelive/np/RAout1ABCDEFGHIJKLMNOPQRSTUVWXYZ0000000000", + "https://w3id.org/sciencelive/np/RAout2ABCDEFGHIJKLMNOPQRSTUVWXYZ0000000000", + ]); + expect(s.topicQids.sort()).toEqual(["Q125928", "Q2922293"]); + expect(s.endDate).toBe("2026-05-10"); + }); +}); + +describe("extractExcerpts noise filter", () => { + it("filters base64-encoded blobs (RSA signatures + public keys)", () => { + const trig = ` + rdfs:label "short label is fine if long enough" . + rdfs:label "${"A".repeat(200)}" . 
+ `; + const out = extractExcerpts(trig); + expect(out.every((s) => !/^[A-Za-z0-9+/=]{100,}$/.test(s))).toBe(true); + expect(out).toContain("short label is fine if long enough"); + }); + + it("filters ISO datetime literals", () => { + const trig = ` + rdfs:label "substantive content with real letters" . + dct:created "2026-05-11T19:40:25.904Z" . + `; + expect(extractExcerpts(trig)).toEqual([ + "substantive content with real letters", + ]); + }); + + it("picks up triple-quoted content (FORRT textarea fields)", () => { + const trig = ` +

"""This is a long multi-line + content block typical of FORRT textareas.""" . + `; + const out = extractExcerpts(trig); + expect(out.some((s) => s.includes("multi-line"))).toBe(true); + }); +}); + +// ============================================================================= +// ROUND 3 — gap-filling: internal helpers + adversarial edge cases +// ============================================================================= + +describe("extractPredicateValueAny + extractPredicateValuesAny (direct)", () => { + it("returns full-URI matches when available", () => { + const trig = ` "value-from-full" .`; + expect( + extractPredicateValueAny(trig, "http://example.org/p", "ex:p"), + ).toBe("value-from-full"); + }); + + it("falls back to prefixed form when full URI is absent", () => { + const trig = ` ex:p "value-from-prefixed" .`; + expect( + extractPredicateValueAny(trig, "http://example.org/p", "ex:p"), + ).toBe("value-from-prefixed"); + }); + + it("returns null when neither form matches", () => { + expect( + extractPredicateValueAny("

.", "http://example.org/q", "ex:q"), + ).toBeNull(); + }); + + it("does NOT merge full-URI and prefixed matches (full URI wins)", () => { + // If full-URI form returns ANY result, prefixed-form is skipped entirely. + // Predictable: callers know which form was matched. + const trig = ` + "from-full-1" . + ex:p "from-prefixed" . + `; + const values = extractPredicateValuesAny( + trig, + "http://example.org/p", + "ex:p", + ); + expect(values).toEqual(["from-full-1"]); + }); +}); + +describe("readObjectSegment edge cases (exercised via extractPredicateValue)", () => { + it("handles a triple-quoted literal that is never closed", () => { + // Truncated TriG — the regex should grab everything to end-of-input + // rather than hang or crash. + const trig = ` """starts but never ends`; + const v = extractPredicateValue(trig, "http://example.org/p"); + expect(typeof v === "string" || v === null).toBe(true); + }); + + it("handles a bracketed URI that is never closed", () => { + const trig = ` + extractPredicateValue(trig, "http://example.org/p"), + ).not.toThrow(); + }); + + it("handles a property list that runs to end-of-string with no terminator", () => { + const trig = ` "no-terminator-dot"`; + expect(extractPredicateValue(trig, "http://example.org/p")).toBe( + "no-terminator-dot", + ); + }); + + it("does NOT terminate on a dot inside an embedded quoted string", () => { + const trig = ` "this. has. dots." ; "next" .`; + expect(extractPredicateValue(trig, "http://example.org/p")).toBe( + "this. has. 
dots.", + ); + expect(extractPredicateValue(trig, "http://example.org/q")).toBe("next"); + }); + + it("handles backslash-escaped quote at the very end of a literal", () => { + const trig = ` "ends with escape \\"" .`; + expect(extractPredicateValue(trig, "http://example.org/p")).toBe( + `ends with escape "`, + ); + }); +}); + +describe("extractGithubUrls — adversarial URL variants", () => { + it("naturally captures only org/repo, dropping /pull/, /issues/ etc.", () => { + // The regex `[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+` only matches two path + // segments after github.com/, so /pull/12 falls off — same effect as + // /tree/ stripping but for free. + const trig = `

.`; + expect(extractGithubUrls(trig)).toEqual(["https://github.com/annefou/repo"]); + }); + + it("rejects raw.githubusercontent.com (not a repo URL)", () => { + const trig = `

.`; + // Current regex is /https?:\/\/github\.com\// — should NOT match raw. + expect(extractGithubUrls(trig)).toEqual([]); + }); + + it("does NOT match git@github.com:user/repo SSH form", () => { + const trig = `

"git@github.com:annefou/repo.git" .`; + expect(extractGithubUrls(trig)).toEqual([]); + }); + + it("returns canonical URL when same repo appears with and without /tree/", () => { + const trig = ` +

. + . + `; + const urls = extractGithubUrls(trig); + // After /tree/ stripping, both collapse to the same URL. + expect(urls).toEqual(["https://github.com/annefou/repo"]); + }); +}); + +describe("extractOutcomeFields — repository fallback chain", () => { + it("falls back to hasRepository when hasOutcomeRepository is absent", () => { + const trig = ` .`; + expect(extractOutcomeFields(trig).repository).toBe( + "https://github.com/annefou/repo", + ); + }); + + it("falls back to schema:codeRepository when neither FORRT predicate present", () => { + const trig = ` .`; + expect(extractOutcomeFields(trig).repository).toBe( + "https://github.com/annefou/repo", + ); + }); + + it("returns the FORRT hasOutcomeRepository when multiple predicates are present (precedence)", () => { + const trig = ` + ; + ; + .`; + expect(extractOutcomeFields(trig).repository).toBe( + "https://doi.org/10.5281/zenodo.1", + ); + }); + + it("accepts a GitHub URL directly as hasOutcomeRepository (not just Zenodo DOIs)", () => { + const trig = ` .`; + expect(extractOutcomeFields(trig).repository).toBe( + "https://github.com/annefou/direct-repo", + ); + }); +}); + +describe("extractClaimFields — variants", () => { + it("returns empty claimType when the only type is bare FORRT-Claim (no subclass)", () => { + const trig = ` a .`; + expect(extractClaimFields(trig).claimType).toBe(""); + }); + + it("accepts claim types containing underscores and digits", () => { + const trig = ` a .`; + expect(extractClaimFields(trig).claimType).toBe("data_quality_v2"); + }); + + it("returns empty aidaStatement when the predicate is missing", () => { + const trig = ` a .`; + expect(extractClaimFields(trig).aidaStatement).toBe(""); + }); +}); + +describe("extractCitoFields — citing entity heuristics", () => { + it("uses the direct-regex path for tight property lists", () => { + // Subject `` directly followed by `` with no `a ;` + // in between — the regex should match the first form. 
+ const trig = ` + .`; + const c = extractCitoFields(trig); + expect(c.citingEntity).toBe( + "https://w3id.org/sciencelive/np/RAouttight000000000000000000000000000000000", + ); + }); + + it("falls back to TriG-mining when subject has an `a ; ` interlude", () => { + // The real-world FORRT outcome-level CiTO shape. + const SELF = + "https://w3id.org/sciencelive/np/RAcitoself0000000000000000000000000000000"; + const SUBJ = + "https://w3id.org/sciencelive/np/RAoutcomeSubj00000000000000000000000000"; + const TPL = "https://w3id.org/np/RAtplCitoX00000000000000000000000000000000"; + const trig = ` +@prefix this: <${SELF}> . +sub:assertion { + <${SUBJ}> a ; + . +} +sub:pubinfo { + this: <${TPL}> . +} +`; + const c = extractCitoFields(trig, { selfUri: SELF, templateUri: TPL }); + expect(c.citingEntity).toBe(SUBJ); + }); + + it("returns empty citingEntity when options are missing and the regex fails", () => { + // Without the self/template options, we can't disambiguate. The direct + // regex still tries, but fails on the `a ;` interlude. So citing + // entity is empty. + const SUBJ = + "https://w3id.org/sciencelive/np/RAouts00000000000000000000000000000000000"; + const trig = ` +<${SUBJ}> a ; + . +`; + const c = extractCitoFields(trig); + // The TriG-only nanopub URI is the subject, and without options we fall + // through to the heuristic which returns the only nanopub URI present. + expect(c.citingEntity).toBe(SUBJ); + }); + + it("does NOT return the self URI as the citing entity", () => { + const SELF = + "https://w3id.org/sciencelive/np/RAself00000000000000000000000000000000000"; + const TPL = "https://w3id.org/np/RAtplY0000000000000000000000000000000000000"; + // Only self URI is in the TriG body — no other nanopub URI. + const trig = ` +<${SELF}> . +<${SELF}> <${TPL}> . +`; + const c = extractCitoFields(trig, { selfUri: SELF, templateUri: TPL }); + // Direct regex matches ` `; my prior fix canonicalises. 
+ expect(c.citingEntity).toBe(SELF); + }); +}); + +describe("extractNanopubMeta — variants", () => { + it("captures xsd:date dct:created (no time component)", () => { + const trig = ` +sub:pubinfo { + this: dct:created "2026-05-11"^^xsd:date; + rdfs:label "X" . +} +`; + expect(extractNanopubMeta(trig).date).toBe("2026-05-11"); + }); + + it("does NOT pick up bare orcid:0000-… prefix form as creator (only absolute URIs)", () => { + // FORRT TriGs sometimes use the compact `orcid:` prefix form. We + // explicitly only accept absolute orcid.org URIs because the prefix + // mapping may differ across publishers. Document this here. + const trig = ` +sub:pubinfo { + this: dct:creator orcid:0000-0002-1784-2920; + rdfs:label "X" . +} +`; + expect(extractNanopubMeta(trig).creators).toEqual([]); + }); + + it("returns the FIRST label across multiple this: blocks", () => { + const trig = ` +sub:assertion { + this: rdfs:label "assertion-label" . +} +sub:pubinfo { + this: rdfs:label "pubinfo-label" . +} +`; + expect(extractNanopubMeta(trig).label).toBe("assertion-label"); + }); +}); + +describe("extractAidaFields — decode edge cases", () => { + it("decodes a plain ASCII AIDA URI", () => { + const trig = ` a .`; + expect(extractAidaFields(trig).sentence).toBe("Simple sentence."); + }); + + it("handles a Unicode-encoded AIDA URI", () => { + const trig = ` a .`; + expect(extractAidaFields(trig).sentence).toBe("Café rules."); + }); + + it("falls back to the raw encoded form on malformed percent-encoding", () => { + const trig = ` a .`; + // decodeURIComponent throws on `%e`; we should fall back to the raw. 
+ expect(extractAidaFields(trig).sentence).toBe("bad%encoding"); + }); +}); + +describe("extractResearchSoftwareFields — variants", () => { + it("returns first GitHub URL when multiple are present", () => { + const trig = ` + ; + .`; + const rs = extractResearchSoftwareFields(trig); + // The repository field comes from the GitHub regex, which returns the + // first match in document order. + expect(rs.repository).toBe("https://github.com/a/first"); + }); + + it("returns empty repository when no GitHub URL present", () => { + const trig = ` .`; + expect(extractResearchSoftwareFields(trig).repository).toBe(""); + }); + + it("captures supportsTargets even when the predicate object is a literal", () => { + // The predicate value extractor accepts literals OR URIs; for cito:supports + // the data is typically a URI but defensively we should not crash on + // string literals either. + const trig = ` "literal-only" .`; + expect(extractResearchSoftwareFields(trig).supportsTargets).toEqual([ + "literal-only", + ]); + }); +}); + +describe("canonicalNanopubUri security boundary", () => { + // The URI flows into `bindUri()` which substitutes it into a SPARQL query + // wrapped in `<…>`. If an attacker could smuggle `>` or `<` or whitespace + // through canonicalNanopubUri, they could break out of the IRI literal and + // inject SPARQL. The regex must reject such inputs at the canonical step. 
+ + it("rejects an attempt to inject closing > and arbitrary text", () => { + expect( + canonicalNanopubUri( + "https://w3id.org/np/RA" + + "X".repeat(20) + + "> ; DROP GRAPH { + expect( + canonicalNanopubUri("https://w3id.org/np/RA1234567 hijklmnop"), + ).toBeNull(); + }); + + it("rejects a quote character inside the URI", () => { + expect( + canonicalNanopubUri('https://w3id.org/np/RA"; SELECT * } #'), + ).toBeNull(); + }); +}); diff --git a/api/src/np/trig.ts b/api/src/np/trig.ts new file mode 100644 index 0000000..de3d222 --- /dev/null +++ b/api/src/np/trig.ts @@ -0,0 +1,763 @@ +/** + * Minimal TriG inspection helpers — regex-based, no full RDF parse. + * + * Cloudflare Workers can run the `n3` parser but it adds ~50 KB to the bundle. + * The Python import-nanopub-chain.py script does mostly regex-level work + * (ORCID + DOI scanning, longest-literal excerpts, single triple match + * for `wasCreatedFromTemplate`), so we mirror that here. + * + * If we ever need full TriG semantics (e.g. resolving blank nodes, walking + * named-graph membership), switch to `n3` then. + */ + +export const ORCID_RE = /https?:\/\/orcid\.org\/0000-[0-9X-]+/gi; +export const DOI_RE = /https?:\/\/doi\.org\/10\.[0-9]+\/[^\s"<>]+/gi; + +const CREATED_FROM_TEMPLATE_RE = + /(?:nt:wasCreatedFromTemplate|)\s+<([^>]+)>/; + +const LABEL_RE = + /(?:rdfs:label||dct:title|)\s+"((?:[^"\\]|\\.)*)"/g; + +// Match string literals with at least 12 chars of content. Excludes newlines +// in the content class so the regex can't span two adjacent literals (closing +// quote of one paired with opening quote of the next captures TriG syntax +// between them as if it were content). +const LITERAL_RE = /"((?:[^"\\\n]|\\.){12,})"/g; + +/** + * Return the template URI used by this nanopub, if `wasCreatedFromTemplate` + * appears in the TriG. The pubinfo graph is where it typically lives; this + * matches the predicate wherever it sits. 
/**
 * Look up the template this nanopub was created from.
 *
 * Runs `CREATED_FROM_TEMPLATE_RE` once against the whole TriG text, so the
 * predicate is found regardless of which graph block it sits in.
 *
 * @param trig - Raw TriG text of a nanopub.
 * @returns The URI captured by the first `wasCreatedFromTemplate` match, or
 *   `null` when the predicate does not occur.
 */
export function extractTemplateUri(trig: string): string | null {
  const templateUri = CREATED_FROM_TEMPLATE_RE.exec(trig)?.[1];
  return templateUri ?? null;
}
/**
 * Collect the distinct ORCID URIs found anywhere in the TriG text.
 *
 * @param trig - Raw TriG text of a nanopub.
 * @returns Every distinct `ORCID_RE` match, in order of first appearance.
 */
export function extractOrcids(trig: string): string[] {
  const hits = Array.from(trig.matchAll(ORCID_RE), (hit) => hit[0]);
  // A Set keeps insertion order, so de-duplication preserves document order.
  return [...new Set<string>(hits)];
}
/**
 * Decode the backslash escapes `\"`, `\\`, `\n` and `\t` in the raw content
 * of a Turtle/TriG string literal.
 *
 * The decoding is done in a SINGLE left-to-right pass. Running one
 * `.replace()` per escape (as this helper previously did) re-scans output
 * produced by an earlier pass: the literal text `\\n` (an escaped backslash
 * followed by the letter `n`) was first reduced to `\n` and then wrongly
 * turned into a newline. Consuming each escape pair exactly once avoids that.
 *
 * Any other backslash sequence (for example `\r` or `\uXXXX`) is left
 * untouched, exactly as before.
 *
 * @param s - Literal content without its surrounding quotes.
 * @returns The content with the four supported escapes decoded.
 */
function unescapeLiteral(s: string): string {
  return s.replace(/\\(["\\nt])/g, (_whole: string, code: string): string => {
    switch (code) {
      case "n":
        return "\n";
      case "t":
        return "\t";
      default:
        // `\"` and `\\` decode to the escaped character itself.
        return code;
    }
  });
}
/**
 * Decide, from its label alone, whether a template is a
 * template-DEFINITION nanopub rather than a chain-step template.
 *
 * The comparison is case-insensitive. A label qualifies when it contains
 * "publishing labels" anywhere, or when it begins with "defining a" /
 * "defining an".
 *
 * @param label - Human-readable template label.
 * @returns `true` when the label matches one of the definition patterns.
 */
export function isTemplateDefinitionLabel(label: string): boolean {
  const lowered = label.toLowerCase();
  if (lowered.includes("publishing labels")) return true;
  const definitionPrefixes = ["defining a", "defining an"];
  return definitionPrefixes.some((prefix) => lowered.startsWith(prefix));
}
/**
 * List every object found in one Turtle property-list segment.
 *
 * Three object shapes are recognised, tried in this order at each position:
 * a bracketed IRI `<…>`, a `"""triple-quoted"""` literal, and a
 * `"single-quoted"` literal. IRIs are returned without their brackets;
 * literal content is passed through `unescapeLiteral`.
 *
 * @param segment - Text of the object list that follows a predicate.
 * @returns The objects in the order they occur in `segment`.
 */
function extractObjectsFromSegment(segment: string): string[] {
  const objectPattern = /<([^>]+)>|"""([\s\S]*?)"""|"((?:[^"\\]|\\.)*)"/g;
  return Array.from(segment.matchAll(objectPattern), (hit) => {
    const [, iri, longLiteral, shortLiteral] = hit;
    if (iri !== undefined) return iri;
    // Exactly one alternative participates in a match, so one of the two
    // literal groups is defined here (possibly as the empty string).
    return unescapeLiteral(longLiteral ?? shortLiteral ?? "");
  });
}
/**
 * Return the text from `start` up to (not including) the `;` or `.` that
 * ends the current Turtle statement.
 *
 * Terminators inside a `"""…"""` literal, a `"…"` literal (with backslash
 * escapes honoured) or a bracketed `<…>` IRI are skipped. If a triple-quoted
 * literal or an IRI is never closed, everything from `start` to the end of
 * the input is returned. If no terminator is found at all, the remainder of
 * the input is returned.
 *
 * @param trig - Full TriG text.
 * @param start - Index at which the object list begins.
 * @returns The raw segment `trig.slice(start, terminatorIndex)`.
 */
function readObjectSegment(trig: string, start: number): string {
  const len = trig.length;
  let pos = start;

  while (pos < len) {
    const ch = trig[pos];

    if (ch === ";" || ch === ".") break;

    if (ch === '"') {
      if (trig.startsWith('"""', pos)) {
        // Long literal: jump straight past the closing delimiter.
        const close = trig.indexOf('"""', pos + 3);
        if (close === -1) return trig.slice(start);
        pos = close + 3;
      } else {
        // Short literal: step over the opening quote, then walk to the
        // closing one, treating backslash + next char as a single unit.
        pos++;
        while (pos < len && trig[pos] !== '"') {
          pos += trig[pos] === "\\" && pos + 1 < len ? 2 : 1;
        }
        if (pos < len) pos++;
      }
      continue;
    }

    if (ch === "<") {
      // Bracketed IRI: dots inside it must not end the statement.
      const close = trig.indexOf(">", pos + 1);
      if (close === -1) return trig.slice(start);
      pos = close + 1;
      continue;
    }

    pos++;
  }

  return trig.slice(start, pos);
}
/**
 * Collect every object attached to `predicate` anywhere in the TriG text.
 *
 * Covers both a predicate that repeats and a predicate followed by a
 * comma-separated object list.
 *
 * `predicate` may be given in either form:
 *   - full URI (`http://…` / `https://…`) — matched as `<URI>` followed by
 *     whitespace;
 *   - prefixed name such as `rdfs:label` — matched literally when it stands
 *     at the start of the input or after whitespace, `;` or `{`.
 *
 * @param trig - Raw TriG text.
 * @param predicate - Predicate in full-URI or prefixed form.
 * @returns All objects found, in document order (empty when none).
 */
export function extractPredicateValues(
  trig: string,
  predicate: string,
): string[] {
  const escaped = predicate.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const isFullUri = /^https?:\/\//.test(predicate);
  const source = isFullUri
    ? `<${escaped}>\\s+`
    : `(?:^|[\\s;{])${escaped}\\s+`;
  const matcher = new RegExp(source, "g");

  const values: string[] = [];
  for (
    let hit = matcher.exec(trig);
    hit !== null;
    hit = matcher.exec(trig)
  ) {
    const objectStart = hit.index + hit[0].length;
    const segment = readObjectSegment(trig, objectStart);
    values.push(...extractObjectsFromSegment(segment));
    // Resume scanning after the consumed object list so the same
    // statement is not matched a second time.
    matcher.lastIndex = objectStart + segment.length;
  }
  return values;
}
b[0] : null; +} + +export function extractPredicateValuesAny( + trig: string, + fullUri: string, + prefixedForm: string, +): string[] { + const a = extractPredicateValues(trig, fullUri); + if (a.length > 0) return a; + return extractPredicateValues(trig, prefixedForm); +} + +/** + * Pubinfo-level metadata for a nanopub: rdfs:label of `this:`, dct:created, + * dct:creator (multiple ORCIDs possible). + * + * The pubinfo graph has `this: rdfs:label "…"; dct:created "…"; dct:creator + * ` (turtle property list) so we match on `this:` AS subject. + */ +export type NanopubMeta = { + label: string; + date: string; + creators: string[]; +}; + +export function extractNanopubMeta(trig: string): NanopubMeta { + // Pubinfo (and sometimes assertion) writes these predicates in prefixed + // form (`rdfs:label`, `dct:created`, `dct:creator`) when the W3ID resolver + // emits the TriG. We accept either prefixed OR full-URI form and take + // the first match for label/date (single-valued) and every match for + // creator (ORCID list). + const label = + extractPredicateValueAny( + trig, + "http://www.w3.org/2000/01/rdf-schema#label", + "rdfs:label", + ) ?? ""; + const date = + extractPredicateValueAny( + trig, + "http://purl.org/dc/terms/created", + "dct:created", + ) ?? 
""; + const creatorObjects = extractPredicateValuesAny( + trig, + "http://purl.org/dc/terms/creator", + "dct:creator", + ); + const creators = new Set(); + for (const obj of creatorObjects) { + if (ORCID_RE.test(obj)) creators.add(obj); + ORCID_RE.lastIndex = 0; + } + return { + label: label.trim(), + date: date.trim(), + creators: [...creators], + }; +} + +// ============================================================================= +// Per-template structured extractors +// ============================================================================= + +export type OutcomeFields = { + conclusion: string; + evidence: string; + limitations: string; + validationStatus: OutcomeValidationStatus | ""; + confidenceLevel: OutcomeConfidenceLevel | ""; + repository: string; + studyUri: string; + endDate: string; +}; + +export function extractOutcomeFields(trig: string): OutcomeFields { + const validationFull = extractPredicateValue( + trig, + `${FORRT_TERMS}hasValidationStatus`, + ); + const confidenceFull = extractPredicateValue( + trig, + `${FORRT_TERMS}hasConfidenceLevel`, + ); + return { + conclusion: + extractPredicateValue(trig, `${FORRT_TERMS}hasConclusionDescription`) ?? + "", + evidence: + extractPredicateValue(trig, `${FORRT_TERMS}hasEvidenceDescription`) ?? "", + limitations: + extractPredicateValue(trig, `${FORRT_TERMS}hasLimitationsDescription`) ?? + "", + validationStatus: stripVocabPrefix( + validationFull, + OUTCOME_VALIDATION_STATUSES, + ), + confidenceLevel: stripVocabPrefix( + confidenceFull, + OUTCOME_CONFIDENCE_LEVELS, + ), + repository: + extractPredicateValue(trig, `${FORRT_TERMS}hasOutcomeRepository`) ?? + extractPredicateValue(trig, `${FORRT_TERMS}hasRepository`) ?? + extractPredicateValue(trig, `${SCHEMA_PREFIX}codeRepository`) ?? + "", + studyUri: + extractPredicateValue(trig, `${FORRT_TERMS}isOutcomeOf`) ?? "", + endDate: extractPredicateValue(trig, `${SCHEMA_PREFIX}endDate`) ?? 
"", + }; +} + +export type StudyFields = { + scope: string; + methodology: string; + deviations: string; + discipline: string; + claimUri: string; +}; + +export function extractStudyFields(trig: string): StudyFields { + return { + scope: + extractPredicateValue(trig, `${FORRT_TERMS}hasScopeDescription`) ?? "", + methodology: + extractPredicateValue(trig, `${FORRT_TERMS}hasMethodologyDescription`) ?? + "", + deviations: + extractPredicateValue(trig, `${FORRT_TERMS}hasDeviationDescription`) ?? + "", + discipline: + extractPredicateValue(trig, `${FORRT_TERMS}hasDiscipline`) ?? "", + claimUri: + extractPredicateValue(trig, `${FORRT_TERMS}targetsClaim`) ?? "", + }; +} + +export type ClaimFields = { + claimType: string; + aidaStatement: string; +}; + +/** + * The Claim's type comes from a subclass URI like + * `…/terms/model_performance-FORRT-Claim`. We parse the type prefix off the + * URI suffix. The AIDA URI it cites lives in `asAidaStatement`. + */ +export function extractClaimFields(trig: string): ClaimFields { + let claimType = ""; + const typeRe = new RegExp( + `<${FORRT_TERMS.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}([A-Za-z0-9_]+)-FORRT-Claim>`, + ); + const tm = typeRe.exec(trig); + if (tm) claimType = tm[1]; + return { + claimType, + aidaStatement: + extractPredicateValue(trig, `${FORRT_TERMS}asAidaStatement`) ?? "", + }; +} + +export type QuoteFields = { + quotedText: string; + citedDoi: string; + comment: string; +}; + +export function extractQuoteFields(trig: string): QuoteFields { + const dois = extractDois(trig); + return { + quotedText: + extractPredicateValue(trig, `${CITO_PREFIX}hasQuotedText`) ?? "", + citedDoi: dois[0] ?? "", + comment: + extractPredicateValue( + trig, + "http://www.w3.org/2000/01/rdf-schema#comment", + ) ?? "", + }; +} + +export type AidaFields = { + sentence: string; +}; + +/** + * AIDA nanopubs encode the sentence as the URI of an AIDA-Sentence subject: + * ``. We decode + * that URI suffix back into the plain sentence. 
*/
export function extractAidaFields(trig: string): AidaFields {
  // NOTE(review): this pattern is reconstructed — the original literal was
  // damaged in this copy of the file (only `]+)>` survived). It assumes AIDA
  // sentence URIs live under http://purl.org/aida/ — confirm against the
  // upstream source before merging.
  const aidaRe = /<http:\/\/purl\.org\/aida\/([^>]+)>/;
  const m = aidaRe.exec(trig);
  if (!m) return { sentence: "" };
  try {
    return { sentence: decodeURIComponent(m[1]).trim() };
  } catch {
    // Malformed percent-encoding: fall back to the raw suffix, trimmed the
    // same way as the decoded form.
    return { sentence: m[1].trim() };
  }
}

export type CitoFields = {
  relations: CitoRelation[];
  citedTargets: string[];
  /**
   * The URI that appears as the SUBJECT of the CiTO triples — i.e. the
   * "citing entity". For Science Live FORRT CiTO Citation nanopubs this is
   * the Outcome URI that the CiTO is attached to. Empty when not found.
   */
  citingEntity: string;
};

/**
 * Collect the CiTO relations used in a TriG body, the de-duplicated targets
 * they point at, and the citing entity.
 *
 * @param options.selfUri - URI of the nanopub being parsed; excluded when
 *   falling back to "first other nanopub URI" as the citing entity.
 * @param options.templateUri - Template URI, excluded in the same fallback.
 */
export function extractCitoFields(
  trig: string,
  options: { selfUri?: string; templateUri?: string } = {},
): CitoFields {
  const relations: CitoRelation[] = [];
  const citedTargets = new Set<string>();
  for (const rel of CITO_RELATIONS) {
    const values = extractPredicateValues(trig, `${CITO_PREFIX}${rel}`);
    if (values.length === 0) continue;
    relations.push(rel);
    for (const v of values) citedTargets.add(v);
  }

  // Find the SUBJECT (citing entity) of the CiTO triples. Two strategies:
  //  1) Direct regex `<subject>\s+<cito-predicate>` — works for compact
  //     property lists.
  //  2) "Only non-self non-template nanopub URI in the TriG" — works for the
  //     common FORRT outcome-level CiTO shape
  //     `<subject> a <type> ; <cito-predicate> <target>` where the rdf:type
  //     triple sits between subject and CiTO predicate.
  let citingEntity = "";
  for (const rel of CITO_RELATIONS) {
    const escaped = `${CITO_PREFIX}${rel}`.replace(
      /[.*+?^${}()|[\]\\]/g,
      "\\$&",
    );
    const m = new RegExp(`<([^>]+)>\\s+<${escaped}>`).exec(trig);
    if (m) {
      citingEntity = canonicalNanopubUri(m[1]) ?? m[1];
      break;
    }
  }
  if (!citingEntity && relations.length > 0) {
    const self = options.selfUri
      ? (canonicalNanopubUri(options.selfUri) ?? options.selfUri)
      : "";
    const tmpl = options.templateUri ?? "";
    for (const u of extractNanopubUris(trig)) {
      if (u === self || u === tmpl) continue;
      citingEntity = u;
      break;
    }
  }

  return { relations, citedTargets: [...citedTargets], citingEntity };
}

/** Structured fields of a Research Software nanopub; absent values are "". */
export type ResearchSoftwareFields = {
  repository: string;
  zenodoDoi: string;
  supportsTargets: string[];
};

/**
 * Pull the first GitHub URL, the first Zenodo DOI, and every `cito:supports`
 * target out of a TriG body.
 */
export function extractResearchSoftwareFields(
  trig: string,
): ResearchSoftwareFields {
  const githubs = extractGithubUrls(trig);
  // Only the first Zenodo DOI is wanted; stop iterating as soon as we have it.
  let zenodo = "";
  for (const m of trig.matchAll(ZENODO_DOI_RE)) {
    zenodo = m[0];
    break;
  }
  return {
    repository: githubs[0] ?? "",
    zenodoDoi: zenodo,
    supportsTargets: extractPredicateValues(trig, `${CITO_PREFIX}supports`),
  };
}

/**
 * Strip a FORRT vocab namespace prefix from a full URI and validate it
 * against a known set. Returns "" if the URI doesn't match any known value.
 *
 * @param fullUri - The full vocabulary URI, or null when the predicate was
 *   absent.
 * @param known - The allowed terms; the last `/`-separated path segment of
 *   `fullUri` must be one of them.
 */
function stripVocabPrefix<T extends string>(
  fullUri: string | null,
  known: readonly T[],
): T | "" {
  if (!fullUri) return "";
  const short = fullUri.split("/").pop() ?? "";
  return (known as readonly string[]).includes(short) ? (short as T) : "";
}

/** Structured fields of a Research Synthesis nanopub; absent values are "". */
export type ResearchSynthesisFields = {
  synthesisDescription: string;
  conditions: string;
  limitations: string;
  recommendations: string;
  supportedByOutcomeUris: string[];
  topicQids: string[];
  endDate: string;
};

/**
 * Research Synthesis is the apex of a multi-chain FORRT constellation. Its
 * `cito:isSupportedBy` predicate enumerates the Outcome nanopubs that the
 * synthesis aggregates — this is the canonical Synthesis → Outcome linkage.
 */
export function extractResearchSynthesisFields(
  trig: string,
): ResearchSynthesisFields {
  const text = (predicate: string): string =>
    extractPredicateValue(trig, predicate) ?? "";
  const supportedBy = extractPredicateValues(
    trig,
    `${CITO_PREFIX}isSupportedBy`,
  );
  // Keep only Wikidata entity subjects and reduce them to their bare Q-id.
  const wikidataQs = extractPredicateValues(trig, `${DCT_PREFIX}subject`)
    .filter((u) => u.startsWith("http://www.wikidata.org/entity/Q"))
    .map((u) => u.split("/").pop() ?? "");
  return {
    synthesisDescription: text(`${FORRT_TERMS}hasSynthesisDescription`),
    conditions: text(`${FORRT_TERMS}hasConditionsDescription`),
    limitations: text(`${FORRT_TERMS}hasLimitationsDescription`),
    recommendations: text(`${FORRT_TERMS}hasRecommendationDescription`),
    // Targets that are not recognisable nanopub URIs are dropped.
    supportedByOutcomeUris: supportedBy
      .map((u) => canonicalNanopubUri(u))
      .filter((u): u is string => u !== null),
    topicQids: wikidataQs,
    endDate: text(`${SCHEMA_PREFIX}endDate`),
  };
}

/** Suppress PROV_PREFIX import warning while keeping the constant available. */
void PROV_PREFIX;
diff --git a/api/vitest.config.ts b/api/vitest.config.ts
new file mode 100644
index 0000000..29c2f3b
--- /dev/null
+++ b/api/vitest.config.ts
@@ -0,0 +1,15 @@
import path from "path";
import { defineConfig } from "vitest/config";

// Vitest configuration for the API package: resolves the `@` alias to
// `./src` and runs `src/**/*.test.ts` in a Node environment.
export default defineConfig({
  resolve: {
    alias: {
      "@": path.resolve(__dirname, "./src"),
    },
  },
  test: {
    environment: "node",
    include: ["src/**/*.test.ts"],
    testTimeout: 10_000,
  },
});
diff --git a/frontend/src/lib/queries/references-from.rq b/frontend/src/lib/queries/references-from.rq
new file mode 100644
index 0000000..eff7013
--- /dev/null
+++ b/frontend/src/lib/queries/references-from.rq
@@ -0,0 +1,45 @@
# Get all nanopubs that the specified nanopub references (the inverse
# direction of nanopub-references.rq). Uses the KP admin networkGraph
# which materialises npa:refersToNanopub edges from each nanopub's
# triples — so finding what a nanopub points at is a single hop.
#
# Placeholder: `?_nanopubUri` - URI: the URI of the *source* nanopub
# (i.e., the nanopub whose outgoing references we want).
#
# Returns the referenced (downstream) nanopub plus its metadata, with
# the same filtering and ordering as the upstream nanopub-references.rq:
# excludes invalidated / superseded nanopubs, comments, approvals.
#
# NOTE(review): the prefix IRIs below were missing from this copy of the
# query and have been restored to the standard nanopub namespaces — confirm
# they match nanopub-references.rq.

prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix np: <http://www.nanopub.org/nschema#>
prefix npa: <http://purl.org/nanopub/admin/>
prefix npx: <http://purl.org/nanopub/x/>
prefix dct: <http://purl.org/dc/terms/>
prefix nt: <https://w3id.org/np/o/ntemplate/>

select ?np ?label ?date ?creator ?template where {
  # Find nanopubs that the entry refers to (the "downstream" direction).
  graph npa:networkGraph {
    ?_nanopubUri npa:refersToNanopub ?np .
  }

  # Same metadata + invalidation/comment/approval filtering as
  # nanopub-references.rq.
  graph npa:graph {
    ?np npa:hasValidSignatureForPublicKeyHash ?pubkey .
    filter not exists { ?npx npx:invalidates ?np ; npa:hasValidSignatureForPublicKeyHash ?pubkey . }
    filter not exists { ?np npx:invalidates ?_nanopubUri . }
    optional { ?np rdfs:label ?label . }
    ?np np:hasAssertion ?assertion ;
      dct:created ?date ;
      dct:creator ?creator .
  }

  # Drop downstream nanopubs whose assertion comments on, approves of, or
  # disapproves of the source nanopub.
  filter not exists { graph ?assertion { ?_nanopubUri rdfs:comment ?_s . } }
  filter not exists { graph ?assertion { ?approver npx:approvesOf ?_nanopubUri . } }
  filter not exists { graph ?assertion { ?disapprover npx:disapprovesOf ?_nanopubUri . } }

  # The template is optional: not every nanopub was created from one.
  optional { graph npa:networkGraph { ?np nt:wasCreatedFromTemplate ?template . } }
}
order by desc(?date)
limit 100