Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions graphify/extract.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,20 @@
"""Deterministic structural extraction from Python code using tree-sitter. Outputs nodes+edges dicts."""
"""Deterministic structural extraction from source code using tree-sitter. Outputs nodes+edges dicts.

source_file contract
--------------------
Every emitted node and edge carries a ``source_file`` field. The value is one of:

* a relative or absolute path string pointing at the file that produced the node/edge, or
* the literal string ``"<external>"`` — a sentinel meaning "this symbol lives outside the
parsed corpus" (e.g. a framework base class referenced via inheritance but never defined
in any input file). Stub nodes for cross-corpus symbols use this sentinel so the
``inherits`` / ``references`` edge survives without misleading downstream validators
into thinking the extractor lost the source.

``""`` and ``None`` are NOT valid values. The validator treats them as missing fields so
the LLM-extraction path can be caught when it omits the field. See
``graphify/validate.py``.
"""
from __future__ import annotations
import json
import re
Expand Down Expand Up @@ -103,14 +119,17 @@ def walk(node, parent_class_nid: str | None = None) -> None:
# Try same-file base first; fall back to a bare stub
base_nid = _make_id(stem, base)
if base_nid not in seen_ids:
# External or forward-declared base - add a stub so edge survives
# External or forward-declared base - add a stub so edge survives.
# The stub uses source_file="<external>" (see module docstring)
# so downstream validators can tell "lives outside the corpus"
# apart from "extractor dropped the field".
base_nid = _make_id(base)
if base_nid not in seen_ids:
nodes.append({
"id": base_nid,
"label": base,
"file_type": "code",
"source_file": "",
"source_file": "<external>",
"source_location": "",
})
seen_ids.add(base_nid)
Expand Down
10 changes: 10 additions & 0 deletions graphify/skill.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,16 @@ confidence_score rules:
Reasonable but not certain: 0.6-0.7. Weak inference: 0.4-0.5.
- AMBIGUOUS edges: score 0.1-0.3

source_file is REQUIRED on every node and every edge. Never omit it, never emit
an empty string, never emit null. Two valid forms:
- A relative path to the file the node/edge came from (the normal case).
- The literal string "<external>" — use this when a node represents a symbol
that lives outside the parsed corpus (e.g. a framework base class referenced
from local code but never defined locally). This sentinel keeps cross-corpus
references visible without making them look like extraction bugs.
This applies to every file_type, including `document` rationale-style nodes —
if the rationale comes from a doc, source_file is that doc's path.

Output exactly this JSON (no other text):
{"nodes":[{"id":"filestem_entityname","label":"Human Readable Name","file_type":"code|document|paper|image","source_file":"relative/path","source_location":null,"source_url":null,"captured_at":null,"author":null,"contributor":null}],"edges":[{"source":"node_id","target":"node_id","relation":"calls|implements|references|cites|conceptually_related_to|shares_data_with|semantically_similar_to","confidence":"EXTRACTED|INFERRED|AMBIGUOUS","confidence_score":1.0,"source_file":"relative/path","source_location":null,"weight":1.0}],"hyperedges":[{"id":"snake_case_id","label":"Human Readable Label","nodes":["node_id1","node_id2","node_id3"],"relation":"participate_in|implement|form","confidence":"EXTRACTED|INFERRED","confidence_score":0.75,"source_file":"relative/path"}],"input_tokens":0,"output_tokens":0}
```
Expand Down
27 changes: 27 additions & 0 deletions graphify/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@
REQUIRED_NODE_FIELDS = {"id", "label", "file_type", "source_file"}
REQUIRED_EDGE_FIELDS = {"source", "target", "relation", "confidence", "source_file"}

# Sentinel value for `source_file` meaning "this symbol lives outside the parsed
# corpus" (e.g. a framework base class referenced via inheritance but never
# defined locally). See graphify/extract.py module docstring for the full
# contract. The validator accepts this sentinel as a valid source_file so the
# real LLM-omission bug (empty / None) stays visible.
EXTERNAL_SENTINEL = "<external>"


def _is_valid_source_file(value: object) -> bool:
"""A valid source_file is a non-empty string (a path or the <external> sentinel)."""
return isinstance(value, str) and value != ""


def validate_extraction(data: dict) -> list[str]:
"""
Expand All @@ -30,6 +42,15 @@ def validate_extraction(data: dict) -> list[str]:
for field in REQUIRED_NODE_FIELDS:
if field not in node:
errors.append(f"Node {i} (id={node.get('id', '?')!r}) missing required field '{field}'")
elif field == "source_file" and not _is_valid_source_file(node[field]):
# Empty string or None still counts as missing — distinguishes
# "outside the corpus" (use the "<external>" sentinel) from
# "extractor bug / LLM forgot the field".
errors.append(
f"Node {i} (id={node.get('id', '?')!r}) missing required field "
f"'source_file' (got {node[field]!r}; use '{EXTERNAL_SENTINEL}' "
f"for cross-corpus symbols)"
)
if "file_type" in node and node["file_type"] not in VALID_FILE_TYPES:
errors.append(
f"Node {i} (id={node.get('id', '?')!r}) has invalid file_type "
Expand All @@ -50,6 +71,12 @@ def validate_extraction(data: dict) -> list[str]:
for field in REQUIRED_EDGE_FIELDS:
if field not in edge:
errors.append(f"Edge {i} missing required field '{field}'")
elif field == "source_file" and not _is_valid_source_file(edge[field]):
errors.append(
f"Edge {i} missing required field 'source_file' "
f"(got {edge[field]!r}; use '{EXTERNAL_SENTINEL}' "
f"for cross-corpus symbols)"
)
if "confidence" in edge and edge["confidence"] not in VALID_CONFIDENCES:
errors.append(
f"Edge {i} has invalid confidence '{edge['confidence']}' "
Expand Down
10 changes: 10 additions & 0 deletions tests/fixtures/sample_external_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Fixture: a class that inherits from a base defined outside this corpus.

The base class `ExternalBase` is not defined in any other fixture, so the AST
extractor must emit it as a stub node so the `inherits` edge survives.
"""


# ``ExternalBase`` is deliberately left undefined here (see the module
# docstring): this file is only parsed, so the undefined name is expected.
class LocalClass(ExternalBase): # noqa: F821 - intentionally undefined; this is a parse-only fixture
    # Minimal placeholder method; returns the constant 1.
    def method(self):
        return 1
40 changes: 40 additions & 0 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,43 @@ def test_calls_deduplication():
result = extract_python(FIXTURES / "sample_calls.py")
call_pairs = [(e["source"], e["target"]) for e in result["edges"] if e["relation"] == "calls"]
assert len(call_pairs) == len(set(call_pairs)), "Duplicate calls edges found"


# ---------------------------------------------------------------------------
# source_file contract: stub nodes for cross-corpus symbols
# ---------------------------------------------------------------------------

def test_external_base_emits_sentinel_source_file():
    """
    A base class that is not defined in the parsed fixture gets a stub node
    so the `inherits` edge survives. That stub must carry
    source_file="<external>" rather than an empty string or None, so that
    downstream validators can tell "outside the corpus" apart from
    "extraction bug".
    """
    extraction = extract_python(FIXTURES / "sample_external_base.py")

    # Collect every node labelled after the undefined base class.
    matches = []
    for node in extraction["nodes"]:
        if node["label"] == "ExternalBase":
            matches.append(node)

    assert len(matches) == 1, f"Expected exactly one stub for ExternalBase, got {len(matches)}"

    stub = matches[0]
    assert stub["source_file"] == "<external>", (
        f"External-symbol stub must use the '<external>' sentinel, "
        f"got {stub['source_file']!r}"
    )


def test_external_base_stub_is_never_empty_string():
    """Regression guard: every emitted node has a non-empty, non-None source_file."""
    extraction = extract_python(FIXTURES / "sample_external_base.py")
    for node in extraction["nodes"]:
        recorded = node["source_file"]
        # Check the empty string first, then None, for each node in turn.
        assert recorded != "", f"Node {node['id']} has empty source_file"
        assert recorded is not None, f"Node {node['id']} has None source_file"


def test_inherits_edge_survives_for_external_base():
    """An `inherits` edge must point at the ExternalBase stub node."""
    extraction = extract_python(FIXTURES / "sample_external_base.py")
    inherit_edges = [edge for edge in extraction["edges"] if edge["relation"] == "inherits"]

    # Search edge-by-edge for one whose target is a node labelled ExternalBase;
    # stop at the first hit.
    stub_is_targeted = False
    for edge in inherit_edges:
        for node in extraction["nodes"]:
            if node["label"] == "ExternalBase" and node["id"] == edge["target"]:
                stub_is_targeted = True
                break
        if stub_is_targeted:
            break

    assert stub_is_targeted, "Expected an inherits edge whose target is the ExternalBase stub"
48 changes: 48 additions & 0 deletions tests/test_skill_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Tests for the extraction-subagent prompt template embedded in skill.md.

The skill.md file is the canonical instruction set the orchestrator hands to
extraction subagents. These tests pin the parts of the prompt that govern the
output schema — specifically that `source_file` is required on every node and
edge, so the validator does not later have to deal with missing values.
"""
from pathlib import Path

SKILL_MD = Path(__file__).parent.parent / "graphify" / "skill.md"


def _skill_text() -> str:
    """Return the full contents of skill.md, decoded as UTF-8."""
    with SKILL_MD.open(encoding="utf-8") as handle:
        return handle.read()


def test_skill_md_exists():
    """skill.md must be present at the path the other prompt tests read from."""
    present = SKILL_MD.exists()
    assert present, f"skill.md not found at {SKILL_MD}"


def test_prompt_marks_source_file_required():
    """
    The extraction prompt must explicitly state that `source_file` is a
    required field on every node and edge. Without this, the LLM elides it
    on rationale/document nodes and the validator can't distinguish
    "outside the corpus" from "LLM forgot to fill the field".
    """
    import re  # local import: this file's top-level imports only provide pathlib

    text = _skill_text()
    # Require "required" to appear close to `source_file` (either order, at
    # most 80 characters apart, with no sentence-ending period in between).
    # Checking the two words independently would pass on any prompt that
    # merely lists `source_file` in the JSON schema example and uses the word
    # "required" somewhere unrelated.
    required_near_field = re.compile(
        r"\bsource_file\b[^.]{0,80}\brequired\b"
        r"|\brequired\b[^.]{0,80}\bsource_file\b",
        re.IGNORECASE,
    )
    assert required_near_field.search(text), (
        "skill.md prompt should describe source_file as required"
    )


def test_prompt_documents_external_sentinel():
    """
    The prompt text has to name the `<external>` sentinel, so subagents
    know it is available for cross-corpus symbols instead of leaving the
    field empty. That keeps the LLM-extractor contract in step with the
    AST-extractor contract.
    """
    prompt = _skill_text()
    sentinel_mentioned = "<external>" in prompt
    assert sentinel_mentioned, (
        "skill.md prompt should mention the '<external>' sentinel "
        "for cross-corpus symbols"
    )
71 changes: 71 additions & 0 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,74 @@ def test_assert_valid_raises_on_errors():

def test_assert_valid_passes_silently():
assert_valid(VALID) # should not raise


# ---------------------------------------------------------------------------
# source_file contract: <external> sentinel + None/empty still flagged
# ---------------------------------------------------------------------------

def test_external_sentinel_source_file_accepted():
    """
    A node whose source_file is the "<external>" sentinel (a symbol living
    outside the parsed corpus, e.g. a framework base class) must validate
    cleanly. If it did not, every framework type would surface as a
    false-positive extraction issue.
    """
    local_node = {"id": "n1", "label": "MyClass", "file_type": "code", "source_file": "my.py"}
    framework_node = {
        "id": "n2",
        "label": "FrameworkBase",
        "file_type": "code",
        "source_file": "<external>",
    }
    inherits_edge = {
        "source": "n1",
        "target": "n2",
        "relation": "inherits",
        "confidence": "EXTRACTED",
        "source_file": "my.py",
        "weight": 1.0,
    }
    payload = {"nodes": [local_node, framework_node], "edges": [inherits_edge]}

    problems = validate_extraction(payload)
    assert problems == []


def test_empty_source_file_flagged_as_missing():
    """
    An empty-string source_file remains a genuine bug (most likely an LLM
    omitting a required field), so the validator must keep reporting it.
    The <external> sentinel is the ONLY non-path string that's allowed.
    """
    blank_node = {"id": "n1", "label": "Doc", "file_type": "document", "source_file": ""}
    payload = {"nodes": [blank_node], "edges": []}

    errors = validate_extraction(payload)
    # At least one message must mention both the field and the offending node.
    relevant = [msg for msg in errors if "source_file" in msg and "n1" in msg]
    assert relevant, (
        f"Expected empty source_file to be flagged, got errors: {errors}"
    )


def test_none_source_file_flagged_as_missing():
    """A source_file of None is reported as missing, just like the empty string."""
    null_node = {"id": "n1", "label": "Doc", "file_type": "document", "source_file": None}
    payload = {"nodes": [null_node], "edges": []}

    errors = validate_extraction(payload)
    # At least one message must mention both the field and the offending node.
    relevant = [msg for msg in errors if "source_file" in msg and "n1" in msg]
    assert relevant, (
        f"Expected None source_file to be flagged, got errors: {errors}"
    )


def test_edge_external_sentinel_source_file_accepted():
    """Edges follow the same contract as nodes: <external> is a valid source_file."""
    node_a = {"id": "n1", "label": "A", "file_type": "code", "source_file": "a.py"}
    node_b = {"id": "n2", "label": "B", "file_type": "code", "source_file": "b.py"}
    external_edge = {
        "source": "n1",
        "target": "n2",
        "relation": "calls",
        "confidence": "INFERRED",
        "source_file": "<external>",
        "weight": 1.0,
    }
    payload = {"nodes": [node_a, node_b], "edges": [external_edge]}

    problems = validate_extraction(payload)
    assert problems == []