From 428a44755bba992cd381c0c80bebbaf565d891f9 Mon Sep 17 00:00:00 2001 From: Joe McKnight Date: Fri, 15 May 2026 16:33:35 +0100 Subject: [PATCH 1/2] test: add failing tests for source_file sentinel contract Pin the contract before implementing: - AST extractor must emit source_file="" for cross-corpus stub nodes (currently emits "") - Validator must reject empty/None source_file as missing (currently only checks key presence) - skill.md prompt must document the sentinel so the LLM extractor stays aligned with the AST extractor 5 new failing tests + 1 fixture (sample_external_base.py). --- tests/fixtures/sample_external_base.py | 10 ++++ tests/test_extract.py | 40 +++++++++++++++ tests/test_skill_prompt.py | 48 +++++++++++++++++ tests/test_validate.py | 71 ++++++++++++++++++++++++++ 4 files changed, 169 insertions(+) create mode 100644 tests/fixtures/sample_external_base.py create mode 100644 tests/test_skill_prompt.py diff --git a/tests/fixtures/sample_external_base.py b/tests/fixtures/sample_external_base.py new file mode 100644 index 000000000..28f98bea7 --- /dev/null +++ b/tests/fixtures/sample_external_base.py @@ -0,0 +1,10 @@ +"""Fixture: a class that inherits from a base defined outside this corpus. + +The base class `ExternalBase` is not defined in any other fixture, so the AST +extractor must emit it as a stub node so the `inherits` edge survives. +""" + + +class LocalClass(ExternalBase): # noqa: F821 - intentionally undefined; this is a parse-only fixture + def method(self): + return 1 diff --git a/tests/test_extract.py b/tests/test_extract.py index 2ae63eaed..2104f9d09 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -142,3 +142,43 @@ def test_calls_deduplication(): result = extract_python(FIXTURES / "sample_calls.py") call_pairs = [(e["source"], e["target"]) for e in result["edges"] if e["relation"] == "calls"] assert len(call_pairs) == len(set(call_pairs)), "Duplicate calls edges found" + + +# --------------------------------------------------------------------------- +# source_file contract: stub nodes for cross-corpus symbols +# --------------------------------------------------------------------------- + +def test_external_base_emits_sentinel_source_file(): + """ + When a class inherits from a symbol not defined in the parsed corpus + (e.g. a framework base class), the extractor adds a stub node so the + `inherits` edge survives. That stub MUST carry source_file="" + rather than an empty string or None, so downstream validators can + distinguish "outside the corpus" from "extraction bug". + """ + result = extract_python(FIXTURES / "sample_external_base.py") + stubs = [n for n in result["nodes"] if n["label"] == "ExternalBase"] + assert len(stubs) == 1, f"Expected exactly one stub for ExternalBase, got {len(stubs)}" + assert stubs[0]["source_file"] == "", ( + f"External-symbol stub must use the '' sentinel, " + f"got {stubs[0]['source_file']!r}" + ) + + +def test_external_base_stub_is_never_empty_string(): + """Regression: no node may emit source_file as an empty string.""" + result = extract_python(FIXTURES / "sample_external_base.py") + for n in result["nodes"]: + assert n["source_file"] != "", f"Node {n['id']} has empty source_file" + assert n["source_file"] is not None, f"Node {n['id']} has None source_file" + + +def test_inherits_edge_survives_for_external_base(): + """The whole point of the stub: the `inherits` edge must still be emitted.""" + result = extract_python(FIXTURES / "sample_external_base.py") + inherits = [e for e in result["edges"] if e["relation"] == "inherits"] + assert any( + result_node["label"] == "ExternalBase" and result_node["id"] == edge["target"] + for edge in inherits + for result_node in result["nodes"] + ), "Expected an inherits edge whose target is the ExternalBase stub" diff --git a/tests/test_skill_prompt.py b/tests/test_skill_prompt.py new file mode 100644 index 000000000..b3be23034 --- /dev/null +++ b/tests/test_skill_prompt.py @@ -0,0 +1,48 @@ +"""Tests for the extraction-subagent prompt template embedded in skill.md. + +The skill.md file is the canonical instruction set the orchestrator hands to +extraction subagents. These tests pin the parts of the prompt that govern the +output schema — specifically that `source_file` is required on every node and +edge, so the validator does not later have to deal with missing values. +""" +from pathlib import Path + +SKILL_MD = Path(__file__).parent.parent / "graphify" / "skill.md" + + +def _skill_text() -> str: + return SKILL_MD.read_text(encoding="utf-8") + + +def test_skill_md_exists(): + assert SKILL_MD.exists(), f"skill.md not found at {SKILL_MD}" + + +def test_prompt_marks_source_file_required(): + """ + The extraction prompt must explicitly state that `source_file` is a + required field on every node and edge. Without this, the LLM elides it + on rationale/document nodes and the validator can't distinguish + "outside the corpus" from "LLM forgot to fill the field". + """ + text = _skill_text() + # Look for a clear "required" statement near source_file in the prompt body. + # We require the literal phrase rather than scanning the JSON schema example + # so the requirement is human-readable to the subagent. + assert "source_file" in text and "required" in text.lower(), ( + "skill.md prompt should describe source_file as required" + ) + + +def test_prompt_documents_external_sentinel(): + """ + The prompt should mention the `` sentinel so subagents know + they may use it for cross-corpus symbols rather than leaving the field + empty. This keeps the AST-extractor contract and the LLM-extractor + contract aligned. + """ + text = _skill_text() + assert "" in text, ( + "skill.md prompt should mention the '' sentinel " + "for cross-corpus symbols" + ) diff --git a/tests/test_validate.py b/tests/test_validate.py index 396e90c8c..913dbdb11 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -85,3 +85,74 @@ def test_assert_valid_raises_on_errors(): def test_assert_valid_passes_silently(): assert_valid(VALID) # should not raise + + +# --------------------------------------------------------------------------- +# source_file contract: sentinel + None/empty still flagged +# --------------------------------------------------------------------------- + +def test_external_sentinel_source_file_accepted(): + """ + source_file="" is a contract-level sentinel meaning "this + symbol lives outside the parsed corpus" (e.g. a framework base class). + The validator must accept it as a valid value — otherwise every + framework type shows up as a false-positive extraction issue. + """ + data = { + "nodes": [ + {"id": "n1", "label": "MyClass", "file_type": "code", "source_file": "my.py"}, + {"id": "n2", "label": "FrameworkBase", "file_type": "code", "source_file": ""}, + ], + "edges": [ + {"source": "n1", "target": "n2", "relation": "inherits", + "confidence": "EXTRACTED", "source_file": "my.py", "weight": 1.0}, + ], + } + assert validate_extraction(data) == [] + + +def test_empty_source_file_flagged_as_missing(): + """ + Empty-string source_file is still a real bug (likely an LLM omitting + a required field). The validator must keep flagging it — the + sentinel is the ONLY non-path string that's allowed. + """ + data = { + "nodes": [ + {"id": "n1", "label": "Doc", "file_type": "document", "source_file": ""}, + ], + "edges": [], + } + errors = validate_extraction(data) + assert any("source_file" in e and "n1" in e for e in errors), ( + f"Expected empty source_file to be flagged, got errors: {errors}" + ) + + +def test_none_source_file_flagged_as_missing(): + """None source_file is still flagged as missing (same reason as empty string).""" + data = { + "nodes": [ + {"id": "n1", "label": "Doc", "file_type": "document", "source_file": None}, + ], + "edges": [], + } + errors = validate_extraction(data) + assert any("source_file" in e and "n1" in e for e in errors), ( + f"Expected None source_file to be flagged, got errors: {errors}" + ) + + +def test_edge_external_sentinel_source_file_accepted(): + """Same contract for edges: is valid.""" + data = { + "nodes": [ + {"id": "n1", "label": "A", "file_type": "code", "source_file": "a.py"}, + {"id": "n2", "label": "B", "file_type": "code", "source_file": "b.py"}, + ], + "edges": [ + {"source": "n1", "target": "n2", "relation": "calls", + "confidence": "INFERRED", "source_file": "", "weight": 1.0}, + ], + } + assert validate_extraction(data) == [] From b2c9c611bee8ae65feebc7098e17b1cf03b3aabb Mon Sep 17 00:00:00 2001 From: Joe McKnight Date: Fri, 15 May 2026 16:37:59 +0100 Subject: [PATCH 2/2] feat: emit '' sentinel for cross-corpus source_file Pin the source_file contract so cross-corpus references (framework base classes referenced via inheritance, etc.) are distinguishable from extraction bugs: - AST extractor: external-base stub nodes now use source_file="" instead of an empty string. Module docstring documents the contract: source_file is either a path or the literal string "". - Validator: empty string and None for source_file are now flagged as missing required field (previously only flagged when the key itself was absent). The "" sentinel is accepted as valid. - skill.md (LLM extraction prompt): source_file is explicitly called out as REQUIRED on every node and edge, including document/rationale-style nodes. Prompt now references the "" sentinel so the LLM extractor stays aligned with the AST extractor. Why: on a polyglot corpus, every `graphify update` emits hundreds of "missing required field 'source_file'" warnings. Some are real extraction bugs (LLM omitted the field); most are framework types referenced via inheritance but never defined locally. The contract change separates real bugs from cross-corpus references at the schema level so downstream consumers can filter cleanly. --- graphify/extract.py | 25 ++++++++++++++++++++++--- graphify/skill.md | 10 ++++++++++ graphify/validate.py | 27 +++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/graphify/extract.py b/graphify/extract.py index c5ceddc00..008009a16 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -1,4 +1,20 @@ -"""Deterministic structural extraction from Python code using tree-sitter. Outputs nodes+edges dicts.""" +"""Deterministic structural extraction from source code using tree-sitter. Outputs nodes+edges dicts. + +source_file contract +-------------------- +Every emitted node and edge carries a ``source_file`` field. The value is one of: + +* a relative or absolute path string pointing at the file that produced the node/edge, or +* the literal string ``""`` — a sentinel meaning "this symbol lives outside the + parsed corpus" (e.g. a framework base class referenced via inheritance but never defined + in any input file). Stub nodes for cross-corpus symbols use this sentinel so the + ``inherits`` / ``references`` edge survives without misleading downstream validators + into thinking the extractor lost the source. + +``""`` and ``None`` are NOT valid values. The validator treats them as missing fields so +the LLM-extraction path can be caught when it omits the field. See +``graphify/validate.py``. +""" from __future__ import annotations import json import re @@ -103,14 +119,17 @@ def walk(node, parent_class_nid: str | None = None) -> None: # Try same-file base first; fall back to a bare stub base_nid = _make_id(stem, base) if base_nid not in seen_ids: - # External or forward-declared base - add a stub so edge survives + # External or forward-declared base - add a stub so edge survives. + # The stub uses source_file="" (see module docstring) + # so downstream validators can tell "lives outside the corpus" + # apart from "extractor dropped the field". base_nid = _make_id(base) if base_nid not in seen_ids: nodes.append({ "id": base_nid, "label": base, "file_type": "code", - "source_file": "", + "source_file": "", "source_location": "", }) seen_ids.add(base_nid) diff --git a/graphify/skill.md b/graphify/skill.md index c9683689f..5eeb10895 100644 --- a/graphify/skill.md +++ b/graphify/skill.md @@ -232,6 +232,16 @@ confidence_score rules: Reasonable but not certain: 0.6-0.7. Weak inference: 0.4-0.5. - AMBIGUOUS edges: score 0.1-0.3 +source_file is REQUIRED on every node and every edge. Never omit it, never emit + an empty string, never emit null. Two valid forms: +- A relative path to the file the node/edge came from (the normal case). +- The literal string "" — use this when a node represents a symbol + that lives outside the parsed corpus (e.g. a framework base class referenced + from local code but never defined locally). This sentinel keeps cross-corpus + references visible without making them look like extraction bugs. +This applies to every file_type, including `document` rationale-style nodes — + if the rationale comes from a doc, source_file is that doc's path. + Output exactly this JSON (no other text): {"nodes":[{"id":"filestem_entityname","label":"Human Readable Name","file_type":"code|document|paper|image","source_file":"relative/path","source_location":null,"source_url":null,"captured_at":null,"author":null,"contributor":null}],"edges":[{"source":"node_id","target":"node_id","relation":"calls|implements|references|cites|conceptually_related_to|shares_data_with|semantically_similar_to","confidence":"EXTRACTED|INFERRED|AMBIGUOUS","confidence_score":1.0,"source_file":"relative/path","source_location":null,"weight":1.0}],"hyperedges":[{"id":"snake_case_id","label":"Human Readable Label","nodes":["node_id1","node_id2","node_id3"],"relation":"participate_in|implement|form","confidence":"EXTRACTED|INFERRED","confidence_score":0.75,"source_file":"relative/path"}],"input_tokens":0,"output_tokens":0} ``` diff --git a/graphify/validate.py b/graphify/validate.py index 39434091c..aa17fbe4a 100644 --- a/graphify/validate.py +++ b/graphify/validate.py @@ -6,6 +6,18 @@ REQUIRED_NODE_FIELDS = {"id", "label", "file_type", "source_file"} REQUIRED_EDGE_FIELDS = {"source", "target", "relation", "confidence", "source_file"} +# Sentinel value for `source_file` meaning "this symbol lives outside the parsed +# corpus" (e.g. a framework base class referenced via inheritance but never +# defined locally). See graphify/extract.py module docstring for the full +# contract. The validator accepts this sentinel as a valid source_file so the +# real LLM-omission bug (empty / None) stays visible. +EXTERNAL_SENTINEL = "" + + +def _is_valid_source_file(value: object) -> bool: + """A valid source_file is a non-empty string (a path or the sentinel).""" + return isinstance(value, str) and value != "" + def validate_extraction(data: dict) -> list[str]: """ @@ -30,6 +42,15 @@ def validate_extraction(data: dict) -> list[str]: for field in REQUIRED_NODE_FIELDS: if field not in node: errors.append(f"Node {i} (id={node.get('id', '?')!r}) missing required field '{field}'") + elif field == "source_file" and not _is_valid_source_file(node[field]): + # Empty string or None still counts as missing — distinguishes + # "outside the corpus" (use the "" sentinel) from + # "extractor bug / LLM forgot the field". + errors.append( + f"Node {i} (id={node.get('id', '?')!r}) missing required field " + f"'source_file' (got {node[field]!r}; use '{EXTERNAL_SENTINEL}' " + f"for cross-corpus symbols)" + ) if "file_type" in node and node["file_type"] not in VALID_FILE_TYPES: errors.append( f"Node {i} (id={node.get('id', '?')!r}) has invalid file_type " @@ -50,6 +71,12 @@ def validate_extraction(data: dict) -> list[str]: for field in REQUIRED_EDGE_FIELDS: if field not in edge: errors.append(f"Edge {i} missing required field '{field}'") + elif field == "source_file" and not _is_valid_source_file(edge[field]): + errors.append( + f"Edge {i} missing required field 'source_file' " + f"(got {edge[field]!r}; use '{EXTERNAL_SENTINEL}' " + f"for cross-corpus symbols)" + ) if "confidence" in edge and edge["confidence"] not in VALID_CONFIDENCES: errors.append( f"Edge {i} has invalid confidence '{edge['confidence']}' "