From 428a44755bba992cd381c0c80bebbaf565d891f9 Mon Sep 17 00:00:00 2001
From: Joe McKnight <mcknight.joseph@gmail.com>
Date: Fri, 15 May 2026 16:33:35 +0100
Subject: [PATCH 1/2] test: add failing tests for <external> source_file
 sentinel contract

Pin the contract before implementing:
- AST extractor must emit source_file="<external>" for cross-corpus stub
  nodes (currently emits "")
- Validator must reject empty/None source_file as missing (currently only
  checks key presence)
- skill.md prompt must document the <external> sentinel so the LLM
  extractor stays aligned with the AST extractor

5 new failing tests + 1 fixture (sample_external_base.py).
---
 tests/fixtures/sample_external_base.py | 10 ++++
 tests/test_extract.py                  | 40 +++++++++++++++
 tests/test_skill_prompt.py             | 48 +++++++++++++++++
 tests/test_validate.py                 | 71 ++++++++++++++++++++++++++
 4 files changed, 169 insertions(+)
 create mode 100644 tests/fixtures/sample_external_base.py
 create mode 100644 tests/test_skill_prompt.py
diff --git a/tests/fixtures/sample_external_base.py b/tests/fixtures/sample_external_base.py
new file mode 100644
index 000000000..28f98bea7
--- /dev/null
+++ b/tests/fixtures/sample_external_base.py
@@ -0,0 +1,10 @@
+"""Fixture: a class that inherits from a base defined outside this corpus.
+
+The base class `ExternalBase` is not defined in any other fixture, so the AST
+extractor must emit it as a stub node so the `inherits` edge survives.
+"""
+
+
+class LocalClass(ExternalBase):  # noqa: F821 - intentionally undefined; this is a parse-only fixture
+    def method(self):
+        return 1
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 2ae63eaed..2104f9d09 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -142,3 +142,43 @@ def test_calls_deduplication():
     result = extract_python(FIXTURES / "sample_calls.py")
     call_pairs = [(e["source"], e["target"]) for e in result["edges"] if e["relation"] == "calls"]
     assert len(call_pairs) == len(set(call_pairs)), "Duplicate calls edges found"
+
+
+# ---------------------------------------------------------------------------
+# source_file contract: stub nodes for cross-corpus symbols
+# ---------------------------------------------------------------------------
+
+def test_external_base_emits_sentinel_source_file():
+    """
+    When a class inherits from a symbol not defined in the parsed corpus
+    (e.g. a framework base class), the extractor adds a stub node so the
+    `inherits` edge survives. That stub MUST carry source_file="<external>"
+    rather than an empty string or None, so downstream validators can
+    distinguish "outside the corpus" from "extraction bug".
+    """
+    result = extract_python(FIXTURES / "sample_external_base.py")
+    stubs = [n for n in result["nodes"] if n["label"] == "ExternalBase"]
+    assert len(stubs) == 1, f"Expected exactly one stub for ExternalBase, got {len(stubs)}"
+    assert stubs[0]["source_file"] == "<external>", (
+        f"External-symbol stub must use the '<external>' sentinel, "
+        f"got {stubs[0]['source_file']!r}"
+    )
+
+
+def test_external_base_stub_is_never_empty_string():
+    """Regression: no node may emit source_file as an empty string."""
+    result = extract_python(FIXTURES / "sample_external_base.py")
+    for n in result["nodes"]:
+        assert n["source_file"] != "", f"Node {n['id']} has empty source_file"
+        assert n["source_file"] is not None, f"Node {n['id']} has None source_file"
+
+
+def test_inherits_edge_survives_for_external_base():
+    """The whole point of the stub: the `inherits` edge must still be emitted."""
+    result = extract_python(FIXTURES / "sample_external_base.py")
+    inherits = [e for e in result["edges"] if e["relation"] == "inherits"]
+    assert any(
+        result_node["label"] == "ExternalBase" and result_node["id"] == edge["target"]
+        for edge in inherits
+        for result_node in result["nodes"]
+    ), "Expected an inherits edge whose target is the ExternalBase stub"
diff --git a/tests/test_skill_prompt.py b/tests/test_skill_prompt.py
new file mode 100644
index 000000000..b3be23034
--- /dev/null
+++ b/tests/test_skill_prompt.py
@@ -0,0 +1,48 @@
+"""Tests for the extraction-subagent prompt template embedded in skill.md.
+
+The skill.md file is the canonical instruction set the orchestrator hands to
+extraction subagents. These tests pin the parts of the prompt that govern the
+output schema — specifically that `source_file` is required on every node and
+edge, so the validator does not later have to deal with missing values.
+"""
+from pathlib import Path
+
+SKILL_MD = Path(__file__).parent.parent / "graphify" / "skill.md"
+
+
+def _skill_text() -> str:
+    return SKILL_MD.read_text(encoding="utf-8")
+
+
+def test_skill_md_exists():
+    assert SKILL_MD.exists(), f"skill.md not found at {SKILL_MD}"
+
+
+def test_prompt_marks_source_file_required():
+    """
+    The extraction prompt must explicitly state that `source_file` is a
+    required field on every node and edge. Without this, the LLM elides it
+    on rationale/document nodes and the validator can't distinguish
+    "outside the corpus" from "LLM forgot to fill the field".
+    """
+    text = _skill_text()
+    # Look for a clear "required" statement near source_file in the prompt body.
+    # We require the literal phrase rather than scanning the JSON schema example
+    # so the requirement is human-readable to the subagent.
+    assert "source_file" in text and "required" in text.lower(), (
+        "skill.md prompt should describe source_file as required"
+    )
+
+
+def test_prompt_documents_external_sentinel():
+    """
+    The prompt should mention the `<external>` sentinel so subagents know
+    they may use it for cross-corpus symbols rather than leaving the field
+    empty. This keeps the AST-extractor contract and the LLM-extractor
+    contract aligned.
+    """
+    text = _skill_text()
+    assert "<external>" in text, (
+        "skill.md prompt should mention the '<external>' sentinel "
+        "for cross-corpus symbols"
+    )
diff --git a/tests/test_validate.py b/tests/test_validate.py
index 396e90c8c..913dbdb11 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -85,3 +85,74 @@ def test_assert_valid_raises_on_errors():
 
 def test_assert_valid_passes_silently():
     assert_valid(VALID)  # should not raise
+
+
+# ---------------------------------------------------------------------------
+# source_file contract: <external> sentinel + None/empty still flagged
+# ---------------------------------------------------------------------------
+
+def test_external_sentinel_source_file_accepted():
+    """
+    source_file="<external>" is a contract-level sentinel meaning "this
+    symbol lives outside the parsed corpus" (e.g. a framework base class).
+    The validator must accept it as a valid value — otherwise every
+    framework type shows up as a false-positive extraction issue.
+    """
+    data = {
+        "nodes": [
+            {"id": "n1", "label": "MyClass", "file_type": "code", "source_file": "my.py"},
+            {"id": "n2", "label": "FrameworkBase", "file_type": "code", "source_file": "<external>"},
+        ],
+        "edges": [
+            {"source": "n1", "target": "n2", "relation": "inherits",
+             "confidence": "EXTRACTED", "source_file": "my.py", "weight": 1.0},
+        ],
+    }
+    assert validate_extraction(data) == []
+
+
+def test_empty_source_file_flagged_as_missing():
+    """
+    Empty-string source_file is still a real bug (likely an LLM omitting
+    a required field). The validator must keep flagging it — the
+    <external> sentinel is the ONLY non-path string that's allowed.
+    """
+    data = {
+        "nodes": [
+            {"id": "n1", "label": "Doc", "file_type": "document", "source_file": ""},
+        ],
+        "edges": [],
+    }
+    errors = validate_extraction(data)
+    assert any("source_file" in e and "n1" in e for e in errors), (
+        f"Expected empty source_file to be flagged, got errors: {errors}"
+    )
+
+
+def test_none_source_file_flagged_as_missing():
+    """None source_file is still flagged as missing (same reason as empty string)."""
+    data = {
+        "nodes": [
+            {"id": "n1", "label": "Doc", "file_type": "document", "source_file": None},
+        ],
+        "edges": [],
+    }
+    errors = validate_extraction(data)
+    assert any("source_file" in e and "n1" in e for e in errors), (
+        f"Expected None source_file to be flagged, got errors: {errors}"
+    )
+
+
+def test_edge_external_sentinel_source_file_accepted():
+    """Same contract for edges: <external> is valid."""
+    data = {
+        "nodes": [
+            {"id": "n1", "label": "A", "file_type": "code", "source_file": "a.py"},
+            {"id": "n2", "label": "B", "file_type": "code", "source_file": "b.py"},
+        ],
+        "edges": [
+            {"source": "n1", "target": "n2", "relation": "calls",
+             "confidence": "INFERRED", "source_file": "<external>", "weight": 1.0},
+        ],
+    }
+    assert validate_extraction(data) == []

From b2c9c611bee8ae65feebc7098e17b1cf03b3aabb Mon Sep 17 00:00:00 2001
From: Joe McKnight <mcknight.joseph@gmail.com>
Date: Fri, 15 May 2026 16:37:59 +0100
Subject: [PATCH 2/2] feat: emit '<external>' sentinel for cross-corpus
 source_file

Pin the source_file contract so cross-corpus references (framework base
classes referenced via inheritance, etc.) are distinguishable from
extraction bugs:

- AST extractor: external-base stub nodes now use source_file="<external>"
  instead of an empty string. Module docstring documents the contract:
  source_file is either a path or the literal string "<external>".
- Validator: empty string and None for source_file are now flagged as
  missing required field (previously only flagged when the key itself
  was absent). The "<external>" sentinel is accepted as valid.
- skill.md (LLM extraction prompt): source_file is explicitly called out
  as REQUIRED on every node and edge, including document/rationale-style
  nodes. Prompt now references the "<external>" sentinel so the LLM
  extractor stays aligned with the AST extractor.

Why: on a polyglot corpus, every `graphify update` emits hundreds of
"missing required field 'source_file'" warnings. Some are real
extraction bugs (LLM omitted the field); most are framework types
referenced via inheritance but never defined locally. The contract
change separates real bugs from cross-corpus references at the schema
level so downstream consumers can filter cleanly.
---
 graphify/extract.py  | 25 ++++++++++++++++++++++---
 graphify/skill.md    | 10 ++++++++++
 graphify/validate.py | 27 +++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/graphify/extract.py b/graphify/extract.py
index c5ceddc00..008009a16 100644
--- a/graphify/extract.py
+++ b/graphify/extract.py
@@ -1,4 +1,20 @@
-"""Deterministic structural extraction from Python code using tree-sitter. Outputs nodes+edges dicts."""
+"""Deterministic structural extraction from source code using tree-sitter. Outputs nodes+edges dicts.
+
+source_file contract
+--------------------
+Every emitted node and edge carries a ``source_file`` field. The value is one of:
+
+* a relative or absolute path string pointing at the file that produced the node/edge, or
+* the literal string ``"<external>"`` — a sentinel meaning "this symbol lives outside the
+  parsed corpus" (e.g. a framework base class referenced via inheritance but never defined
+  in any input file). Stub nodes for cross-corpus symbols use this sentinel so the
+  ``inherits`` / ``references`` edge survives without misleading downstream validators
+  into thinking the extractor lost the source.
+
+``""`` and ``None`` are NOT valid values. The validator treats them as missing fields so
+the LLM-extraction path can be caught when it omits the field. See
+``graphify/validate.py``.
+"""
 from __future__ import annotations
 import json
 import re
@@ -103,14 +119,17 @@ def walk(node, parent_class_nid: str | None = None) -> None:
                         # Try same-file base first; fall back to a bare stub
                         base_nid = _make_id(stem, base)
                         if base_nid not in seen_ids:
-                            # External or forward-declared base - add a stub so edge survives
+                            # External or forward-declared base - add a stub so edge survives.
+                            # The stub uses source_file="<external>" (see module docstring)
+                            # so downstream validators can tell "lives outside the corpus"
+                            # apart from "extractor dropped the field".
                             base_nid = _make_id(base)
                             if base_nid not in seen_ids:
                                 nodes.append({
                                     "id": base_nid,
                                     "label": base,
                                     "file_type": "code",
-                                    "source_file": "",
+                                    "source_file": "<external>",
                                     "source_location": "",
                                 })
                                 seen_ids.add(base_nid)
diff --git a/graphify/skill.md b/graphify/skill.md
index c9683689f..5eeb10895 100644
--- a/graphify/skill.md
+++ b/graphify/skill.md
@@ -232,6 +232,16 @@ confidence_score rules:
   Reasonable but not certain: 0.6-0.7. Weak inference: 0.4-0.5.
 - AMBIGUOUS edges: score 0.1-0.3
 
+source_file is REQUIRED on every node and every edge. Never omit it, never emit
+  an empty string, never emit null. Two valid forms:
+- A relative path to the file the node/edge came from (the normal case).
+- The literal string "<external>" — use this when a node represents a symbol
+  that lives outside the parsed corpus (e.g. a framework base class referenced
+  from local code but never defined locally). This sentinel keeps cross-corpus
+  references visible without making them look like extraction bugs.
+This applies to every file_type, including `document` rationale-style nodes —
+  if the rationale comes from a doc, source_file is that doc's path.
+
 Output exactly this JSON (no other text):
 {"nodes":[{"id":"filestem_entityname","label":"Human Readable Name","file_type":"code|document|paper|image","source_file":"relative/path","source_location":null,"source_url":null,"captured_at":null,"author":null,"contributor":null}],"edges":[{"source":"node_id","target":"node_id","relation":"calls|implements|references|cites|conceptually_related_to|shares_data_with|semantically_similar_to","confidence":"EXTRACTED|INFERRED|AMBIGUOUS","confidence_score":1.0,"source_file":"relative/path","source_location":null,"weight":1.0}],"hyperedges":[{"id":"snake_case_id","label":"Human Readable Label","nodes":["node_id1","node_id2","node_id3"],"relation":"participate_in|implement|form","confidence":"EXTRACTED|INFERRED","confidence_score":0.75,"source_file":"relative/path"}],"input_tokens":0,"output_tokens":0}
 ```
diff --git a/graphify/validate.py b/graphify/validate.py
index 39434091c..aa17fbe4a 100644
--- a/graphify/validate.py
+++ b/graphify/validate.py
@@ -6,6 +6,18 @@
 REQUIRED_NODE_FIELDS = {"id", "label", "file_type", "source_file"}
 REQUIRED_EDGE_FIELDS = {"source", "target", "relation", "confidence", "source_file"}
 
+# Sentinel value for `source_file` meaning "this symbol lives outside the parsed
+# corpus" (e.g. a framework base class referenced via inheritance but never
+# defined locally). See graphify/extract.py module docstring for the full
+# contract. The validator accepts this sentinel as a valid source_file so the
+# real LLM-omission bug (empty / None) stays visible.
+EXTERNAL_SENTINEL = "<external>"
+
+
+def _is_valid_source_file(value: object) -> bool:
+    """A valid source_file is a non-empty string (a path or the <external> sentinel)."""
+    return isinstance(value, str) and value != ""
+
 
 def validate_extraction(data: dict) -> list[str]:
     """
@@ -30,6 +42,15 @@ def validate_extraction(data: dict) -> list[str]:
             for field in REQUIRED_NODE_FIELDS:
                 if field not in node:
                     errors.append(f"Node {i} (id={node.get('id', '?')!r}) missing required field '{field}'")
+                elif field == "source_file" and not _is_valid_source_file(node[field]):
+                    # Empty string or None still counts as missing — distinguishes
+                    # "outside the corpus" (use the "<external>" sentinel) from
+                    # "extractor bug / LLM forgot the field".
+                    errors.append(
+                        f"Node {i} (id={node.get('id', '?')!r}) missing required field "
+                        f"'source_file' (got {node[field]!r}; use '{EXTERNAL_SENTINEL}' "
+                        f"for cross-corpus symbols)"
+                    )
             if "file_type" in node and node["file_type"] not in VALID_FILE_TYPES:
                 errors.append(
                     f"Node {i} (id={node.get('id', '?')!r}) has invalid file_type "
@@ -50,6 +71,12 @@ def validate_extraction(data: dict) -> list[str]:
             for field in REQUIRED_EDGE_FIELDS:
                 if field not in edge:
                     errors.append(f"Edge {i} missing required field '{field}'")
+                elif field == "source_file" and not _is_valid_source_file(edge[field]):
+                    errors.append(
+                        f"Edge {i} missing required field 'source_file' "
+                        f"(got {edge[field]!r}; use '{EXTERNAL_SENTINEL}' "
+                        f"for cross-corpus symbols)"
+                    )
             if "confidence" in edge and edge["confidence"] not in VALID_CONFIDENCES:
                 errors.append(
                     f"Edge {i} has invalid confidence '{edge['confidence']}' "