From 639bb6493e23ebb8d0ed950ccc5dad870533c181 Mon Sep 17 00:00:00 2001 From: Kedar Vartak Date: Wed, 27 May 2026 09:42:08 +0530 Subject: [PATCH 1/5] fix: update file node ID generation to use _file_stem for consistency and compatibility with cached graphs --- graphify/build.py | 27 ++++++++++++++ graphify/extract.py | 78 +++++++++++++++++----------------------- tests/test_rationale.py | 79 +++++++++++++++++++++++------------------ 3 files changed, 103 insertions(+), 81 deletions(-) diff --git a/graphify/build.py b/graphify/build.py index f180419db..0ace617d6 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -65,6 +65,21 @@ def _normalize_id(s: str) -> str: return cleaned.strip("_").casefold() +_EXT_SUFFIXES = re.compile(r"_(py|ts|tsx|js|jsx|mjs|rb|go|java|cs|cpp|c|h|rs|kt|scala|php|swift|sql|r|lua|ex|exs|erl|hs|clj|cljs|ml|fs|fsx|vb|groovy|gradle|sh|bash|zsh|ps1|psm1|tf|hcl|yaml|yml|json|toml|xml|html|css|scss|sass|less|vue|svelte|md|rst|txt)$") + + +def _strip_ext_suffix(nid: str) -> str: + """Strip a trailing language-extension suffix from a legacy node ID. + + Old builds used _make_id(str(path)) which appended the file extension as an + underscore-separated token (e.g. script_pipeline_step_py). New builds use + _make_id(_file_stem(path)) which omits the extension. Stripping the suffix + here lets existing cached graph.json files merge correctly with new extractions + without requiring --force re-extraction. + """ + return _EXT_SUFFIXES.sub("", nid) + + def _norm_source_file(p: str | None, root: str | None = None) -> str | None: """Normalize path separators and relativize absolute paths. @@ -144,6 +159,18 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat ft = node.get("file_type", "") if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}: node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept") + # Strip legacy extension suffix from file-level node IDs (e.g. script_pipeline_step_py + # → script_pipeline_step) so old cached graphs merge with new AST extractions (#952). + if "id" in node: + node["id"] = _strip_ext_suffix(node["id"]) + + for edge in extraction.get("edges", []): + if not isinstance(edge, dict): + continue + if "source" in edge: + edge["source"] = _strip_ext_suffix(edge["source"]) + if "target" in edge: + edge["target"] = _strip_ext_suffix(edge["target"]) errors = validate_extraction(extraction) # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors. diff --git a/graphify/extract.py b/graphify/extract.py index 88372d173..1a46d71d4 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -1682,7 +1682,7 @@ def ensure_named_node(name: str, line: int) -> str: add_node(nid, name, line) return nid - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def walk(node, parent_class_nid: str | None = None) -> None: @@ -2524,7 +2524,7 @@ def _extract_python_rationale(path: Path, result: dict) -> None: nodes = result["nodes"] edges = result["edges"] seen_ids = {n["id"] for n in nodes} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) def _get_docstring(body_node) -> tuple[str, int] | None: if not body_node: @@ -2542,25 +2542,11 @@ def _get_docstring(body_node) -> tuple[str, int] | None: def _add_rationale(text: str, line: int, parent_nid: str) -> None: label = text[:80].replace("\r\n", " ").replace("\r", " ").replace("\n", " ").strip() - rid = _make_id(stem, "rationale", str(line)) - if rid not in seen_ids: - seen_ids.add(rid) - nodes.append({ - "id": rid, - "label": label, - "file_type": "rationale", - "source_file": str_path, - "source_location": f"L{line}", - }) - edges.append({ - "source": rid, - "target": parent_nid, - "relation": "rationale_for", - "confidence": "EXTRACTED", - "source_file": str_path, - "source_location": f"L{line}", - "weight": 1.0, - }) + for n in nodes: + if n["id"] == parent_nid: + if "rationale" not in n: + n["rationale"] = label + break # Module-level docstring — skip for auto-generated files (Alembic, Django # migrations, protobuf stubs, etc.) whose module docstrings are revision @@ -2969,7 +2955,7 @@ def _add_edge(src: str, tgt: str, relation: str, line: int, plain_method_re = _re.compile(r"""^\s*def\s+(\w+)\s*\(""") current_class_nid: str | None = None - file_nid = _make_id(str_path) + file_nid = _make_id(_file_stem(path)) # Ensure the file node exists (tree-sitter pass may have emitted it) if file_nid not in seen_ids: @@ -3064,7 +3050,7 @@ def extract_blade(path: Path) -> dict: except OSError: return {"error": f"cannot read {path}"} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str(path), "source_location": None}] edges = [] @@ -3112,7 +3098,7 @@ def extract_dart(path: Path) -> dict: # Use stem (not str(path)) for child IDs to keep them machine-independent. stem = _file_stem(path) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str(path), "source_location": None}] edges = [] @@ -3194,7 +3180,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, "confidence": confidence, "confidence_score": score, "source_file": str_path, "source_location": f"L{line}", "weight": 1.0}) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def walk(node, module_nid: str | None = None) -> None: @@ -3281,7 +3267,7 @@ def extract_sql(path: Path) -> dict: stem = _file_stem(path) str_path = str(path) - file_nid = _make_id(str_path) + file_nid = _make_id(_file_stem(path)) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": None}] edges: list[dict] = [] @@ -3589,7 +3575,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _func_name_from_signature(sig_node) -> str | None: @@ -3841,7 +3827,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _fortran_name(stmt_node) -> str | None: @@ -4013,7 +3999,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def walk(node) -> None: @@ -4241,7 +4227,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def walk(node, parent_impl_nid: str | None = None) -> None: @@ -4420,7 +4406,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _extract_import(node) -> None: @@ -4590,7 +4576,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) _PS_SKIP = frozenset({ @@ -6042,7 +6028,7 @@ def _resolve_cross_file_java_imports( new_edges: list[dict] = [] seen_pairs: set[tuple[str, str]] = set() for path in paths: - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) try: source = path.read_bytes() tree = parser.parse(source) @@ -6129,7 +6115,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _read(node) -> str: @@ -6331,7 +6317,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) _IMPORT_KEYWORDS = frozenset({"alias", "import", "require", "use"}) @@ -6523,7 +6509,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, "confidence": confidence, "source_file": str_path, "source_location": f"L{line}", "weight": weight}) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) # Track heading stack for nesting: [(level, nid), ...] @@ -6861,7 +6847,7 @@ def _add_edge(src: str, tgt: str, relation: str, line: int, context: str | None def _lineno(text: str, offset: int) -> int: return text.count("\n", 0, offset) + 1 - file_nid = _make_id(str_path) + file_nid = _make_id(_file_stem(path)) _add_node(file_nid, path.name, 1) stripped = _pascal_strip_comments(raw) @@ -7036,7 +7022,7 @@ def add_edge( edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) module_nid = file_nid @@ -7259,7 +7245,7 @@ def add_edge( edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) obj_re = re.compile(r"^\s*object\s+\w+\s*:\s*(\w+)", re.IGNORECASE) @@ -7359,7 +7345,7 @@ def add_edge( edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) obj_re = re.compile(r"^\s*object\s+\w+\s*:\s*(\w+)", re.IGNORECASE) @@ -7471,7 +7457,7 @@ def add_edge(src: str, tgt: str, relation: str, context: str | None = None) -> N edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name) name_elem = xml_root.find(".//Package/Name") @@ -7570,7 +7556,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) # file_nid is fully path-derived and never produced by _make_id(stem, func_name), # so appending "__entry" guarantees a distinct ID from any function node. entry_nid = file_nid + "__entry" @@ -7756,7 +7742,7 @@ def extract_sln(path: Path) -> dict: except OSError: return {"nodes": [], "edges": [], "error": f"cannot read {path}"} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) str_path = str(path) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": None}] @@ -7843,7 +7829,7 @@ def extract_csproj(path: Path) -> dict: except ET.ParseError as e: return {"nodes": [], "edges": [], "error": f"XML parse error: {e}"} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) str_path = str(path) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": None}] @@ -7944,7 +7930,7 @@ def extract_razor(path: Path) -> dict: except OSError: return {"nodes": [], "edges": [], "error": f"cannot read {path}"} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) str_path = str(path) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": None}] @@ -8103,7 +8089,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _key_text(pair_node) -> str | None: diff --git a/tests/test_rationale.py b/tests/test_rationale.py index 67bd3df97..25590da47 100644 --- a/tests/test_rationale.py +++ b/tests/test_rationale.py @@ -1,4 +1,8 @@ -"""Tests for rationale/docstring extraction in extract.py.""" +"""Tests for rationale/docstring extraction in extract.py. + +Rationale is now stored as a 'rationale' attribute on the parent node +rather than as separate rationale nodes with rationale_for edges. +""" import textwrap from pathlib import Path import pytest @@ -11,15 +15,19 @@ def _write_py(tmp_path: Path, code: str) -> Path: return p +def _nodes_with_rationale(result): + return [n for n in result["nodes"] if n.get("rationale")] + + def test_module_docstring_extracted(tmp_path): path = _write_py(tmp_path, ''' """This module handles authentication because legacy sessions were insecure.""" def login(): pass ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert len(rationale) >= 1 - assert any("authentication" in n["label"] for n in rationale) + nodes = _nodes_with_rationale(result) + assert len(nodes) >= 1 + assert any("authentication" in n["rationale"] for n in nodes) def test_function_docstring_extracted(tmp_path): @@ -29,8 +37,8 @@ def process(): pass ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert any("chunked" in n["label"] for n in rationale) + nodes = _nodes_with_rationale(result) + assert any("chunked" in n["rationale"] for n in nodes) def test_class_docstring_extracted(tmp_path): @@ -40,8 +48,8 @@ class Cache: pass ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert any("Redis" in n["label"] for n in rationale) + nodes = _nodes_with_rationale(result) + assert any("Redis" in n["rationale"] for n in nodes) def test_rationale_comment_extracted(tmp_path): @@ -51,11 +59,12 @@ def build(): pass ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert any("NOTE" in n["label"] for n in rationale) + nodes = _nodes_with_rationale(result) + assert any("NOTE" in n["rationale"] for n in nodes) -def test_rationale_for_edges_present(tmp_path): +def test_no_rationale_for_edges(tmp_path): + """Rationale is now an attribute, not a node+edge — no rationale_for edges.""" path = _write_py(tmp_path, ''' """Module docstring explaining the why.""" def foo(): @@ -64,29 +73,31 @@ def foo(): ''') result = extract_python(path) rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"] - assert len(rationale_edges) >= 1 + assert len(rationale_edges) == 0 -def test_short_docstring_ignored(tmp_path): - """Trivial docstrings under 20 chars should not become rationale nodes.""" +def test_no_rationale_nodes(tmp_path): + """No separate rationale nodes are emitted — rationale lives on parent nodes.""" path = _write_py(tmp_path, ''' + """Module docstring explaining the why.""" def foo(): - """Constructor.""" + """Function docstring with rationale.""" pass ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert len(rationale) == 0 + rationale_nodes = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert len(rationale_nodes) == 0 -def test_rationale_confidence_is_extracted(tmp_path): +def test_short_docstring_ignored(tmp_path): + """Trivial docstrings under 20 chars should not set the rationale attribute.""" path = _write_py(tmp_path, ''' - """This module exists because we needed a standalone parser.""" - def parse(): pass + def foo(): + """Constructor.""" + pass ''') result = extract_python(path) - rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"] - assert all(e.get("confidence") == "EXTRACTED" for e in rationale_edges) + assert len(_nodes_with_rationale(result)) == 0 def test_alembic_module_docstring_suppressed(tmp_path): @@ -108,8 +119,8 @@ def downgrade(): pass ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert not any("Revision ID" in n["label"] for n in rationale) + nodes = _nodes_with_rationale(result) + assert not any("Revision ID" in n["rationale"] for n in nodes) def test_alembic_function_docstrings_still_extracted(tmp_path): @@ -127,11 +138,9 @@ def downgrade(): pass ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - # module docstring suppressed - assert not any("Revision ID" in n["label"] for n in rationale) - # function docstring still captured - assert any("auth" in n["label"] for n in rationale) + nodes = _nodes_with_rationale(result) + assert not any("Revision ID" in n["rationale"] for n in nodes) + assert any("auth" in n["rationale"] for n in nodes) def test_non_migration_revision_var_not_suppressed(tmp_path): @@ -143,8 +152,8 @@ def test_non_migration_revision_var_not_suppressed(tmp_path): def get_revision(): pass ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert any("audit history" in n["label"] for n in rationale) + nodes = _nodes_with_rationale(result) + assert any("audit history" in n["rationale"] for n in nodes) def test_django_migration_module_docstring_suppressed(tmp_path): @@ -157,8 +166,8 @@ class Migration(migrations.Migration): operations = [] ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert not any("post_priority" in n["label"] for n in rationale) + nodes = _nodes_with_rationale(result) + assert not any("post_priority" in n["rationale"] for n in nodes) def test_generated_file_module_docstring_suppressed(tmp_path): @@ -170,5 +179,5 @@ class UserMessage: pass ''') result = extract_python(path) - rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert not any("protocol buffer" in n["label"].lower() for n in rationale) + nodes = _nodes_with_rationale(result) + assert not any("protocol buffer" in n["rationale"].lower() for n in nodes) From 34f97da8164daf708945a0ad4b008151021987f4 Mon Sep 17 00:00:00 2001 From: Kedar Vartak Date: Wed, 27 May 2026 16:20:33 +0530 Subject: [PATCH 2/5] fix: normalize file node IDs by using _file_stem to handle compound extensions and avoid ID collisions --- graphify/extract.py | 64 +++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/graphify/extract.py b/graphify/extract.py index 1a46d71d4..3b81b9c1c 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -47,13 +47,29 @@ def _make_id(*parts: str) -> str: return cleaned.strip("_").casefold() +# Inner extensions that appear in compound filenames (e.g. Button.svelte.ts). +# Stripping them here ensures is-mobile.svelte.ts and is-mobile.svelte share +# the same base ID, while leaving symbol names like format_c or test_py untouched. +_COMPOUND_INNER_EXTS = frozenset({".svelte", ".vue", ".astro"}) + + def _file_stem(path: Path) -> str: - """Return a stem qualified with the parent directory name to avoid ID collisions - when multiple files share the same filename in different directories (#550).""" + """Return a normalised stem qualified with the parent directory name. + + Strips compound inner extensions (e.g. .svelte from Button.svelte.ts) so + that the runes/TypeScript variant of a component produces the same base ID + as the component file itself. Avoids ID collisions for same-named files in + different directories (#550). + """ + stem = path.stem + # Strip compound inner extension: "is-mobile.svelte" → "is-mobile" + inner = Path(stem).suffix + if inner in _COMPOUND_INNER_EXTS: + stem = Path(stem).stem parent = path.parent.name if parent and parent not in (".", ""): - return f"{parent}.{path.stem}" - return path.stem + return f"{parent}.{stem}" + return stem _TSCONFIG_ALIAS_CACHE: dict[str, dict[str, str]] = {} @@ -736,7 +752,7 @@ def _resolve_js_import_target(raw: str, str_path: str) -> "tuple[str, Path | Non return None resolved_path = _resolve_js_module_path(raw, Path(str_path).parent) if resolved_path is not None: - return _make_id(str(resolved_path)), resolved_path + return _make_id(_file_stem(resolved_path)), resolved_path module_name = raw.split("/")[-1] if not module_name: return None @@ -2627,23 +2643,23 @@ def extract_svelte(path: Path) -> dict: import re as _re src = path.read_text(encoding="utf-8", errors="replace") existing_ids = {n["id"] for n in result.get("nodes", [])} - # Source file node ID must match the one _extract_generic creates: - # _make_id(str(path)) - single arg, no stem prefix. Otherwise the source - # endpoint is a phantom node and build_from_json drops the edge (#701). - file_node_id = _make_id(str(path)) + # Source file node ID must match the one _extract_generic creates via + # _make_id(_file_stem(path)). Using _file_stem here (not str(path)) + # keeps the source endpoint consistent and avoids phantom nodes (#701). + file_node_id = _make_id(_file_stem(path)) aliases = _load_tsconfig_aliases(path.parent) for m in _re.finditer(r"""import\(\s*['"]([^'"]+)['"]\s*\)""", src): raw = m.group(1) if not raw: continue if raw.startswith("."): - # Relative import - resolve to full path so IDs match file node IDs. + # Relative import - resolve via _file_stem so IDs match file node IDs. resolved = Path(os.path.normpath(path.parent / raw)) # Apply same TS/Svelte resolver fixups as static imports so dynamic # imports of bare paths and .svelte.ts rune files land on real # file nodes instead of phantom ids (#716). resolved = _resolve_js_module_path(resolved) - node_id = _make_id(str(resolved)) + node_id = _make_id(_file_stem(resolved)) stub_source_file = str(resolved) else: # Check tsconfig.json path aliases (e.g. "$lib/" -> "src/lib/", "@/" -> "src/") @@ -2657,7 +2673,7 @@ def extract_svelte(path: Path) -> dict: break if resolved_alias is not None: resolved_alias = _resolve_js_module_path(resolved_alias) - node_id = _make_id(str(resolved_alias)) + node_id = _make_id(_file_stem(resolved_alias)) stub_source_file = str(resolved_alias) else: # Bare/scoped import (node_modules) - use last segment; @@ -2709,7 +2725,7 @@ def extract_svelte(path: Path) -> dict: resolved = resolved.with_suffix(".ts") elif resolved.suffix == ".jsx": resolved = resolved.with_suffix(".tsx") - node_id = _make_id(str(resolved)) + node_id = _make_id(_file_stem(resolved)) stub_source_file = str(resolved) else: resolved_alias = None @@ -2719,7 +2735,7 @@ def extract_svelte(path: Path) -> dict: resolved_alias = Path(os.path.normpath(Path(alias_base) / rest)) break if resolved_alias is not None: - node_id = _make_id(str(resolved_alias)) + node_id = _make_id(_file_stem(resolved_alias)) stub_source_file = str(resolved_alias) else: module_name = raw.split("/")[-1] @@ -2768,7 +2784,7 @@ def extract_astro(path: Path) -> dict: import re as _re src = path.read_text(encoding="utf-8", errors="replace") existing_ids = {n["id"] for n in result.get("nodes", [])} - file_node_id = _make_id(str(path)) + file_node_id = _make_id(_file_stem(path)) aliases = _load_tsconfig_aliases(path.parent) # Dynamic imports anywhere in the file: `import('./X.astro')` is legal in # frontmatter setup code and inside expression slots. @@ -2779,7 +2795,7 @@ def extract_astro(path: Path) -> dict: if raw.startswith("."): resolved = Path(os.path.normpath(path.parent / raw)) resolved = _resolve_js_module_path(resolved) - node_id = _make_id(str(resolved)) + node_id = _make_id(_file_stem(resolved)) stub_source_file = str(resolved) else: resolved_alias = None @@ -2790,7 +2806,7 @@ def extract_astro(path: Path) -> dict: break if resolved_alias is not None: resolved_alias = _resolve_js_module_path(resolved_alias) - node_id = _make_id(str(resolved_alias)) + node_id = _make_id(_file_stem(resolved_alias)) stub_source_file = str(resolved_alias) else: module_name = raw.split("/")[-1] @@ -2845,7 +2861,7 @@ def extract_astro(path: Path) -> dict: resolved = resolved.with_suffix(".ts") elif resolved.suffix == ".jsx": resolved = resolved.with_suffix(".tsx") - node_id = _make_id(str(resolved)) + node_id = _make_id(_file_stem(resolved)) stub_source_file = str(resolved) else: resolved_alias = None @@ -2855,7 +2871,7 @@ def extract_astro(path: Path) -> dict: resolved_alias = Path(os.path.normpath(Path(alias_base) / rest)) break if resolved_alias is not None: - node_id = _make_id(str(resolved_alias)) + node_id = _make_id(_file_stem(resolved_alias)) stub_source_file = str(resolved_alias) else: module_name = raw.split("/")[-1] @@ -4929,7 +4945,7 @@ def _apply_symbol_resolution_facts( return path_by_resolved = {path.resolve(): path for path in paths} - source_file_id = {path.resolve(): _make_id(str(path)) for path in paths} + source_file_id = {path.resolve(): _make_id(_file_stem(path)) for path in paths} symbol_nodes: dict[tuple[Path, str], str] = {} for node in nodes: source_path = _js_source_path(str(node.get("source_file", "")), root) @@ -8518,12 +8534,14 @@ def extract( _augment_symbol_resolution_edges(paths, all_nodes, all_edges, root) # Remap file node IDs from absolute-path-derived to project-relative so - # graph.json edge endpoints are stable across machines (#502) + # graph.json edge endpoints are stable across machines (#502). + # Both sides go through _file_stem so compound-extension nodes (e.g. + # .svelte.ts) map to the same normalised ID as their component counterpart. id_remap: dict[str, str] = {} for path in paths: - old_id = _make_id(str(path)) + old_id = _make_id(_file_stem(path)) try: - new_id = _make_id(str(path.relative_to(root))) + new_id = _make_id(_file_stem(path.relative_to(root))) except ValueError: continue if old_id != new_id: From e3cc683d89ef65f00da48ff9836472ebcdef3265 Mon Sep 17 00:00:00 2001 From: Kedar Vartak Date: Thu, 28 May 2026 22:54:33 +0530 Subject: [PATCH 3/5] feat: implement _file_stem function for consistent file ID generation and update related extraction logic --- graphify/build.py | 45 ++++++++++++------- graphify/extract.py | 53 +++++++++++------------ graphify/symbol_resolution.py | 17 +++++--- tests/test_astro_extraction.py | 13 +++--- tests/test_build.py | 17 ++++++++ tests/test_import_extension_resolution.py | 31 ++++++------- 6 files changed, 105 insertions(+), 71 deletions(-) diff --git a/graphify/build.py b/graphify/build.py index 7dab28369..13880390d 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -51,6 +51,16 @@ } +_LANG_EXT_SUFFIX_RE = re.compile( + r"_(?:py|pyi|pyw|js|jsx|mjs|cjs|ts|tsx|vue|svelte|astro|go|rs|java|kt|kts|scala|groovy|c|h|cc|cpp|cxx|hh|hpp|hxx|rb|php|cs|swift|lua|r|sh|bash|zsh|fish|ps1|sql|html|css|scss|sass|less|md|mdx|json|yaml|yml|toml|xml|proto|graphql|gql|dart|ex|exs|erl|hrl|clj|cljs|fs|fsx|vb|pl|pm|pas|pp|dpr|dpk|inc|sol|move|sv|svh|v)$" +) + + +def _strip_ext_suffix(s: str) -> str: + """Drop legacy extension suffixes from path-derived IDs (#1033).""" + return _LANG_EXT_SUFFIX_RE.sub("", s) + + def _normalize_id(s: str) -> str: r"""Normalize an ID string the same way extract._make_id does. @@ -65,19 +75,8 @@ def _normalize_id(s: str) -> str: return cleaned.strip("_").casefold() -_EXT_SUFFIXES = re.compile(r"_(py|ts|tsx|js|jsx|mjs|rb|go|java|cs|cpp|c|h|rs|kt|scala|php|swift|sql|r|lua|ex|exs|erl|hs|clj|cljs|ml|fs|fsx|vb|groovy|gradle|sh|bash|zsh|ps1|psm1|tf|hcl|yaml|yml|json|toml|xml|html|css|scss|sass|less|vue|svelte|md|rst|txt)$") - - -def _strip_ext_suffix(nid: str) -> str: - """Strip a trailing language-extension suffix from a legacy node ID. - - Old builds used _make_id(str(path)) which appended the file extension as an - underscore-separated token (e.g. script_pipeline_step_py). New builds use - _make_id(_file_stem(path)) which omits the extension. Stripping the suffix - here lets existing cached graph.json files merge correctly with new extractions - without requiring --force re-extraction. - """ - return _EXT_SUFFIXES.sub("", nid) +def _canonical_id(s: str) -> str: + return _strip_ext_suffix(_normalize_id(str(s))) def _norm_source_file(p: str | None, root: str | None = None) -> str | None: @@ -136,6 +135,8 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat for node in extraction.get("nodes", []): if not isinstance(node, dict): continue + if node.get("id") not in (None, ""): + node["id"] = _canonical_id(str(node["id"])) if "source" in node and "source_file" not in node: # Count edges that reference this node so the warning is actionable (#479) node_id = node.get("id", "?") @@ -172,6 +173,18 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat if "target" in edge: edge["target"] = _strip_ext_suffix(edge["target"]) + for edge in extraction.get("edges", []): + if not isinstance(edge, dict): + continue + if "source" not in edge and "from" in edge: + edge["source"] = edge["from"] + if "target" not in edge and "to" in edge: + edge["target"] = edge["to"] + if edge.get("source") not in (None, ""): + edge["source"] = _canonical_id(str(edge["source"])) + if edge.get("target") not in (None, ""): + edge["target"] = _canonical_id(str(edge["target"])) + errors = validate_extraction(extraction) # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors. real_errors = [e for e in errors if "does not match any node id" not in e] @@ -186,7 +199,7 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat # Normalized ID map: lets edges survive when the LLM generates IDs with # slightly different casing or punctuation than the AST extractor. # e.g. "Session_ValidateToken" maps to "session_validatetoken". - norm_to_id: dict[str, str] = {_normalize_id(nid): nid for nid in node_set} + norm_to_id: dict[str, str] = {_canonical_id(nid): nid for nid in node_set} # Iterate edges in a deterministic order. The graph is undirected and stores # direction in _src/_tgt; when two edges collapse onto the same node pair the # last write wins, so an unstable iteration order flips _src/_tgt run-to-run @@ -208,9 +221,9 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat src, tgt = edge["source"], edge["target"] # Remap mismatched IDs via normalization before dropping the edge. if src not in node_set: - src = norm_to_id.get(_normalize_id(src), src) + src = norm_to_id.get(_canonical_id(src), src) if tgt not in node_set: - tgt = norm_to_id.get(_normalize_id(tgt), tgt) + tgt = norm_to_id.get(_canonical_id(tgt), tgt) if src not in node_set or tgt not in node_set: continue # skip edges to external/stdlib nodes - expected, not an error attrs = {k: v for k, v in edge.items() if k not in ("source", "target")} diff --git a/graphify/extract.py b/graphify/extract.py index 563be9502..c379c933c 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -752,7 +752,7 @@ def _import_python(node, source: bytes, file_nid: str, stem: str, edges: list, s for _ in range(dots - 1): base = base.parent rel = (module_name.replace(".", "/") + ".py") if module_name else "__init__.py" - tgt_nid = _make_id(str(base / rel)) + tgt_nid = _make_id(_file_stem(base / rel)) else: tgt_nid = _make_id(raw) edges.append({ @@ -996,7 +996,7 @@ def _import_c(node, source: bytes, file_nid: str, stem: str, edges: list, str_pa if child.type != "system_lib_string": resolved = _resolve_c_include_path(raw, str_path) if resolved is not None: - tgt_nid = _make_id(str(resolved)) + tgt_nid = _make_id(_file_stem(resolved)) edges.append({ "source": file_nid, "target": tgt_nid, @@ -2684,8 +2684,7 @@ def extract_svelte(path: Path) -> dict: src = path.read_text(encoding="utf-8", errors="replace") existing_ids = {n["id"] for n in result.get("nodes", [])} # Source file node ID must match the one _extract_generic creates via - # _make_id(_file_stem(path)). Using _file_stem here (not str(path)) - # keeps the source endpoint consistent and avoids phantom nodes (#701). + # _make_id(_file_stem(path)); otherwise this edge points at a phantom node (#701). file_node_id = _make_id(_file_stem(path)) aliases = _load_tsconfig_aliases(path.parent) for m in _re.finditer(r"""import\(\s*['"]([^'"]+)['"]\s*\)""", src): @@ -5076,7 +5075,7 @@ def add_edge(source: str, target: str, relation: str, context: str, line: int, s if source_id is not None: add_edge( source_id, - _make_id(str(path_by_resolved.get(target_path, target_path))), + _make_id(_file_stem(path_by_resolved.get(target_path, target_path))), "re_exports", "export", star_fact.line, @@ -5100,7 +5099,7 @@ def add_edge(source: str, target: str, relation: str, context: str, line: int, s if source_id is not None: add_edge( source_id, - _make_id(str(path_by_resolved.get(origin[0], origin[0]))), + _make_id(_file_stem(path_by_resolved.get(origin[0], origin[0]))), "re_exports", "export", export_fact.line, @@ -6675,7 +6674,7 @@ def _pascal_resolve_unit(from_path: Path, unit_name: str) -> str: """Resolve a Pascal unit name to the graphify node ID of its source file. Scans all Pascal files under the project root (the highest ancestor that - directly contains .pas/.dpr files) and returns _make_id(str(matched_path)). + directly contains .pas/.dpr files) and returns _make_id(_file_stem(matched_path)). Result is cached per project root so the rglob runs at most once per project. Falls back to _make_id(unit_name) for units not found on disk (e.g. standard RTL units like SysUtils, Windows). @@ -6686,7 +6685,7 @@ def _pascal_resolve_unit(from_path: Path, unit_name: str) -> str: unit_map: dict[str, str] = {} for ext in (".pas", ".pp", ".dpr", ".dpk", ".inc"): for f in root.rglob("*" + ext): - unit_map[f.stem.lower()] = _make_id(str(f)) + unit_map[f.stem.lower()] = _make_id(_file_stem(f)) _pascal_unit_cache[root_key] = unit_map return _pascal_unit_cache[root_key].get(unit_name.lower(), _make_id(unit_name)) @@ -7613,8 +7612,8 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edges.append(edge) file_nid = _make_id(_file_stem(path)) - # file_nid is fully path-derived and never produced by _make_id(stem, func_name), - # so appending "__entry" guarantees a distinct ID from any function node. + # file_nid is the file-level parent/stem ID, so appending "__entry" + # guarantees a distinct ID from any function node. entry_nid = file_nid + "__entry" add_node(file_nid, path.name, 1, kind="file") add_node(entry_nid, f"{path.name} script", 1, kind="bash_entrypoint") @@ -7735,7 +7734,7 @@ def walk(node, parent_nid: str) -> None: # like `source ../../etc/passwd` that traverse outside # the project tree (B-1). if resolved.exists(): - tgt_nid = _make_id(str(resolved)) + tgt_nid = _make_id(_file_stem(resolved)) add_edge(file_nid, tgt_nid, "imports_from", line, context="import") else: @@ -8281,7 +8280,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _type_path_text(node) -> str: @@ -8322,7 +8321,7 @@ def walk(node, parent_type_path: "str | None" = None, resolved = (path.parent / norm).resolve() edge: dict = { "source": file_nid, - "target": _make_id(str(resolved)) if resolved.exists() else _make_id(norm), + "target": _make_id(_file_stem(resolved)) if resolved.exists() else _make_id(norm), "relation": "imports_from" if resolved.exists() else "imports", "context": "import", "confidence": "EXTRACTED", @@ -8520,7 +8519,7 @@ def extract_dmi(path: Path) -> dict: str_path = str(path) stem = _file_stem(path) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": "L1"}] edges: list[dict] = [] @@ -8619,7 +8618,7 @@ def extract_dmm(path: Path) -> dict: return {"nodes": [], "edges": [], "error": str(e)} str_path = str(path) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": "L1"}] edges: list[dict] = [] @@ -8692,7 +8691,7 @@ def extract_dmf(path: Path) -> dict: str_path = str(path) stem = _file_stem(path) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": "L1"}] edges: list[dict] = [] @@ -9087,19 +9086,19 @@ def extract( _augment_symbol_resolution_edges(paths, all_nodes, all_edges, root) - # Remap file node IDs from absolute-path-derived to project-relative so - # graph.json edge endpoints are stable across machines (#502). - # Both sides go through _file_stem so compound-extension nodes (e.g. - # .svelte.ts) map to the same normalised ID as their component counterpart. + # Remap legacy full-path-derived file node IDs to parent/stem IDs so cached + # per-file extractions converge with fresh output (#1033). id_remap: dict[str, str] = {} for path in paths: - old_id = _make_id(_file_stem(path)) + new_id = _make_id(_file_stem(path)) + legacy_ids = {_make_id(os.fspath(path))} try: - new_id = _make_id(_file_stem(path.relative_to(root))) + legacy_ids.add(_make_id(os.fspath(path.relative_to(root)))) except ValueError: - continue - if old_id != new_id: - id_remap[old_id] = new_id + pass + for old_id in legacy_ids: + if old_id != new_id: + id_remap[old_id] = new_id if id_remap: for n in all_nodes: if n.get("id") in id_remap: @@ -9170,7 +9169,7 @@ def extract( # Map each node back to its containing file_id so we can ask # "did the caller's file import the callee's file?" - # Use relativized paths to match how file node IDs were remapped above (#502). + # Use parent/stem file IDs to match file nodes after normalization (#1033). nid_to_file_nid: dict[str, str] = {} for n in all_nodes: sf = n.get("source_file") @@ -9181,7 +9180,7 @@ def extract( sf_rel = sf_path.relative_to(root) if sf_path.is_absolute() else sf_path except ValueError: sf_rel = sf_path - nid_to_file_nid[n["id"]] = _make_id(str(sf_rel)) + nid_to_file_nid[n["id"]] = _make_id(_file_stem(sf_rel)) existing_pairs = {(e["source"], e["target"]) for e in all_edges} for rc in all_raw_calls: diff --git a/graphify/symbol_resolution.py b/graphify/symbol_resolution.py index 7bc68093a..1f044f95a 100644 --- a/graphify/symbol_resolution.py +++ b/graphify/symbol_resolution.py @@ -365,14 +365,17 @@ def _bash_make_id(*parts: str) -> str: return cleaned.strip("_").casefold() +def _file_stem(path: Path) -> str: + parent = path.parent.name + if parent and parent not in (".", ""): + return f"{parent}.{path.stem}" + return path.stem + + def _file_node_id_for_path(path: Path, root: Path) -> str: - # Resolve both sides so callers that pass relative or non-canonical roots - # get the same canonical relative path that extract()'s id_remap produces. - # _bash_make_id is an exact copy of extract._make_id, so IDs match. - try: - return _bash_make_id(str(path.resolve().relative_to(root.resolve()))) - except ValueError: - return _bash_make_id(str(path)) # path outside root: hash absolute path as fallback + # Keep Bash source-edge file IDs in sync with extract._file_stem-based + # file node IDs (#1033). + return _bash_make_id(_file_stem(path)) def resolve_bash_source_edges( diff --git a/tests/test_astro_extraction.py b/tests/test_astro_extraction.py index c21e66c24..9a50c0f4c 100644 --- a/tests/test_astro_extraction.py +++ b/tests/test_astro_extraction.py @@ -13,6 +13,7 @@ from graphify.detect import CODE_EXTENSIONS from graphify.extract import ( + _file_stem, _make_id, extract_astro, ) @@ -57,8 +58,8 @@ def test_extract_astro_picks_up_frontmatter_static_imports(tmp_path): result = extract_astro(page) targets = _import_targets(result, relation="imports_from") - assert _make_id(str(layout)) in targets - assert _make_id(str(hero)) in targets + assert _make_id(_file_stem(layout)) in targets + assert _make_id(_file_stem(hero)) in targets def test_extract_astro_handles_dynamic_import_in_frontmatter(tmp_path): @@ -75,7 +76,7 @@ def test_extract_astro_handles_dynamic_import_in_frontmatter(tmp_path): result = extract_astro(page) targets = _import_targets(result, relation="dynamic_import") - assert _make_id(str(other)) in targets + assert _make_id(_file_stem(other)) in targets def test_extract_astro_picks_up_client_side_script_imports(tmp_path): @@ -100,8 +101,8 @@ def test_extract_astro_picks_up_client_side_script_imports(tmp_path): result = extract_astro(page) targets = _import_targets(result, relation="imports_from") - assert _make_id(str(layout)) in targets - assert _make_id(str(hydrate)) in targets + assert _make_id(_file_stem(layout)) in targets + assert _make_id(_file_stem(hydrate)) in targets def test_extract_astro_no_frontmatter_does_not_crash(tmp_path): @@ -140,4 +141,4 @@ def test_extract_astro_handles_tsconfig_path_alias(tmp_path): result = extract_astro(page) targets = _import_targets(result, relation="imports_from") - assert _make_id(str(hero)) in targets + assert _make_id(_file_stem(hero)) in targets diff --git a/tests/test_build.py b/tests/test_build.py index 065d7f414..419cb63d1 100644 --- a/tests/test_build.py +++ b/tests/test_build.py @@ -346,6 +346,23 @@ def test_build_from_json_relative_source_file_unchanged(tmp_path): assert G.nodes["foo_bar"]["source_file"] == "src/foo.py" +def test_build_from_json_strips_legacy_language_extension_suffixes(): + extraction = { + "nodes": [ + {"id": "script_pipeline_step_py", "label": "pipeline_step.py", "file_type": "code"}, + {"id": "consumer", "label": "consumer", "file_type": "code"}, + ], + "edges": [ + {"source": "consumer", "target": "script_pipeline_step_py", + "relation": "imports_from", "confidence": "EXTRACTED"}, + ], + } + G = build_from_json(extraction) + assert "script_pipeline_step" in G + assert "script_pipeline_step_py" not in G + assert G.has_edge("consumer", "script_pipeline_step") + + def test_build_merge_prune_absolute_paths_match_relative_nodes(tmp_path): """#1007: manifest stores absolute paths, graph nodes store relative paths. prune_sources with absolute paths must still remove the right nodes and edges.""" diff --git a/tests/test_import_extension_resolution.py b/tests/test_import_extension_resolution.py index 0d1222c0a..7312a14aa 100644 --- a/tests/test_import_extension_resolution.py +++ b/tests/test_import_extension_resolution.py @@ -1,16 +1,17 @@ """Tests for #716 — TypeScript bare-path imports, Svelte 5 rune file imports (`from './foo.svelte'` for a `.svelte.ts` file), and directory/index.ts -imports must resolve to the actual file's node id, not a phantom. +imports must resolve to the actual file's parent/stem node id, not a phantom. Before #716, `_import_js` only rewrote `.js → .ts` and `.jsx → .tsx`. Every other shape (bare path, `.svelte → .svelte.ts`, `./foo` directory imports) -produced an id like `..._foo` while the real file's node id was `..._foo_ts`, +produced an id like `..._foo` while the real file's node id is `parent_foo`, so `build_from_json` dropped the edge as external. """ from pathlib import Path from graphify.extract import ( + _file_stem, _make_id, _resolve_js_module_path, extract_js, @@ -185,7 +186,7 @@ def test_bare_path_import_resolves_in_ts_file(tmp_path): importer = _write(tmp_path / "page.ts", "import type { GetNestedType } from './type-helpers'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Bare-path .ts import must resolve to target node id; " f"expected {expected}; got {_import_targets(result)}" @@ -199,7 +200,7 @@ def test_directory_import_resolves_to_index_ts(tmp_path): importer = _write(tmp_path / "page.ts", "import { enqueue } from './queue'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Directory import must resolve to ./queue/index.ts; " f"expected {expected}; got {_import_targets(result)}" @@ -216,7 +217,7 @@ def test_dot_svelte_import_resolves_to_dot_svelte_ts(tmp_path): importer = _write(tmp_path / "page.ts", "import { isMobile } from './is-mobile.svelte'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f".svelte → .svelte.ts resolution failed; " f"expected {expected}; got {_import_targets(result)}" @@ -233,7 +234,7 @@ def test_explicit_ts_import_still_works(tmp_path): importer = _write(tmp_path / "page.ts", "import { x } from './foo.ts'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Explicit .ts imports must still resolve; " f"expected {expected}; got {_import_targets(result)}" @@ -247,7 +248,7 @@ def test_explicit_svelte_import_still_works(tmp_path): importer = _write(tmp_path / "page.ts", "import Card from './Card.svelte'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Existing .svelte imports must resolve to the .svelte node, " f"not get redirected; expected {expected}; " @@ -285,7 +286,7 @@ def test_alias_import_with_bare_path_resolves(tmp_path): importer = _write(importer_dir / "page.ts", "import type { X } from '$lib/type-helpers'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Alias + bare-path resolution failed; " f"expected {expected}; got {_import_targets(result)}" @@ -304,7 +305,7 @@ def test_type_only_import_with_bare_path_resolves(tmp_path): importer = _write(tmp_path / "page.ts", "import type { GetNestedType } from './type-helpers'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Type-only import with bare path failed to resolve; " f"expected {expected}; got {_import_targets(result)}" @@ -342,7 +343,7 @@ def test_alias_directory_import_resolves_to_index_ts(tmp_path): importer = _write(src / "routes" / "page.ts", "import { enqueue } from '$lib/queue'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Alias + directory resolution failed; " f"expected {expected}; got {_import_targets(result)}" @@ -428,7 +429,7 @@ def test_end_to_end_multi_dot_import_resolves(tmp_path): importer = _write(tmp_path / "page.ts", "import { apply } from './tag-action.shared'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Multi-dot import failed end-to-end; " f"expected {expected}; got {_import_targets(result)}" @@ -448,7 +449,7 @@ def test_resolve_chain_alias_and_extension_compose(tmp_path): importer = _write(src / "routes" / "page.ts", "import { isMobile } from '$lib/hooks/is-mobile.svelte'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Alias + .svelte→.svelte.ts chain failed to compose; " f"expected {expected}; got {_import_targets(result)}" @@ -473,7 +474,7 @@ def test_ts_dynamic_import_bare_path_resolves(tmp_path): } """) result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) targets = {str(e.get("target") or "") for e in result["edges"] if e.get("relation") in ("imports", "imports_from")} assert expected in targets, ( @@ -498,7 +499,7 @@ def test_ts_dynamic_import_alias_with_bare_path_resolves(tmp_path): } """) result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) targets = {str(e.get("target") or "") for e in result["edges"] if e.get("relation") in ("imports", "imports_from")} assert expected in targets, ( @@ -521,7 +522,7 @@ def test_dynamic_import_bare_path_resolves(tmp_path): result = extract_svelte(importer) dyn_targets = {str(e.get("target") or "") for e in result["edges"] if e.get("relation") == "dynamic_import"} - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in dyn_targets, ( f"dynamic_import of .svelte that's actually .svelte.ts must " f"resolve through the new resolver; " From 7d85b092a8b3a9a43cfcb256468b776b8240a4d6 Mon Sep 17 00:00:00 2001 From: Kedar Vartak Date: Thu, 28 May 2026 23:10:32 +0530 Subject: [PATCH 4/5] fix: limit legacy extension stripping to file nodes --- graphify/build.py | 58 ++++++++++++++++++++++++++++++++++----------- tests/test_build.py | 20 +++++++++++++++- 2 files changed, 63 insertions(+), 15 deletions(-) diff --git a/graphify/build.py b/graphify/build.py index 13880390d..fe654e0c6 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -76,7 +76,40 @@ def _normalize_id(s: str) -> str: def _canonical_id(s: str) -> str: - return _strip_ext_suffix(_normalize_id(str(s))) + return _normalize_id(str(s)) + + +def _file_stem_from_path(path_text: str) -> str: + path = Path(path_text.replace("\\", "/")) + parent = path.parent.name + if parent and parent not in (".", ""): + return f"{parent}.{path.stem}" + return path.stem + + +def _looks_like_file_node(node: dict) -> bool: + label = str(node.get("label") or "") + source_file = str(node.get("source_file") or "") + if source_file and label == Path(source_file.replace("\\", "/")).name: + return True + return bool(label and Path(label).suffix) + + +def _legacy_file_node_id(node: dict) -> str | None: + if not _looks_like_file_node(node): + return None + node_id = _canonical_id(str(node.get("id") or "")) + source_file = str(node.get("source_file") or "") + label = str(node.get("label") or "") + candidates: set[str] = set() + if source_file: + candidates.add(_normalize_id(source_file)) + if label: + candidates.add(_strip_ext_suffix(node_id)) + if node_id in candidates or _strip_ext_suffix(node_id) in candidates: + stem_source = source_file or label + return _normalize_id(_file_stem_from_path(stem_source)) + return None def _norm_source_file(p: str | None, root: str | None = None) -> str | None: @@ -132,11 +165,19 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat extraction = dict(extraction, edges=extraction["links"]) # Canonicalize legacy node/edge schema before validation. + id_remap: dict[str, str] = {} for node in extraction.get("nodes", []): if not isinstance(node, dict): continue + original_id = str(node.get("id") or "") if node.get("id") not in (None, ""): node["id"] = _canonical_id(str(node["id"])) + file_node_id = _legacy_file_node_id(node) + if file_node_id and file_node_id != node.get("id"): + id_remap[node["id"]] = file_node_id + if original_id: + id_remap[_canonical_id(original_id)] = file_node_id + node["id"] = file_node_id if "source" in node and "source_file" not in node: # Count edges that reference this node so the warning is actionable (#479) node_id = node.get("id", "?") @@ -160,19 +201,6 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat ft = node.get("file_type", "") if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}: node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept") - # Strip legacy extension suffix from file-level node IDs (e.g. script_pipeline_step_py - # → script_pipeline_step) so old cached graphs merge with new AST extractions (#952). - if "id" in node: - node["id"] = _strip_ext_suffix(node["id"]) - - for edge in extraction.get("edges", []): - if not isinstance(edge, dict): - continue - if "source" in edge: - edge["source"] = _strip_ext_suffix(edge["source"]) - if "target" in edge: - edge["target"] = _strip_ext_suffix(edge["target"]) - for edge in extraction.get("edges", []): if not isinstance(edge, dict): continue @@ -182,8 +210,10 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat edge["target"] = edge["to"] if edge.get("source") not in (None, ""): edge["source"] = _canonical_id(str(edge["source"])) + edge["source"] = id_remap.get(edge["source"], edge["source"]) if edge.get("target") not in (None, ""): edge["target"] = _canonical_id(str(edge["target"])) + edge["target"] = id_remap.get(edge["target"], edge["target"]) errors = validate_extraction(extraction) # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors. diff --git a/tests/test_build.py b/tests/test_build.py index 419cb63d1..24edaa2a4 100644 --- a/tests/test_build.py +++ b/tests/test_build.py @@ -349,7 +349,7 @@ def test_build_from_json_relative_source_file_unchanged(tmp_path): def test_build_from_json_strips_legacy_language_extension_suffixes(): extraction = { "nodes": [ - {"id": "script_pipeline_step_py", "label": "pipeline_step.py", "file_type": "code"}, + {"id": "script_pipeline_step_py", "label": "pipeline_step.py", "file_type": "code", "source_file": "script/pipeline_step.py"}, {"id": "consumer", "label": "consumer", "file_type": "code"}, ], "edges": [ @@ -363,6 +363,24 @@ def test_build_from_json_strips_legacy_language_extension_suffixes(): assert G.has_edge("consumer", "script_pipeline_step") +def test_build_from_json_does_not_strip_symbol_ids_that_look_like_extensions(): + extraction = { + "nodes": [ + {"id": "format_c", "label": "format_c()", "file_type": "code", "source_file": "script/pipeline_step.py"}, + {"id": "parse_json", "label": "parse_json()", "file_type": "code", "source_file": "script/pipeline_step.py"}, + {"id": "test_py", "label": "test_py()", "file_type": "code", "source_file": "script/pipeline_step.py"}, + ], + "edges": [ + {"source": "format_c", "target": "parse_json", "relation": "calls", "confidence": "EXTRACTED"}, + {"source": "test_py", "target": "format_c", "relation": "calls", "confidence": "EXTRACTED"}, + ], + } + G = build_from_json(extraction) + assert {"format_c", "parse_json", "test_py"}.issubset(G.nodes) + assert G.has_edge("format_c", "parse_json") + assert G.has_edge("test_py", "format_c") + + def test_build_merge_prune_absolute_paths_match_relative_nodes(tmp_path): """#1007: manifest stores absolute paths, graph nodes store relative paths. prune_sources with absolute paths must still remove the right nodes and edges.""" From 4b621e88b497ad12c4a93c10915f9a83b3b01a41 Mon Sep 17 00:00:00 2001 From: Kedar Vartak Date: Thu, 28 May 2026 23:12:37 +0530 Subject: [PATCH 5/5] test: keep rationale schema unchanged in id fix --- tests/test_rationale.py | 118 +++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 55 deletions(-) diff --git a/tests/test_rationale.py b/tests/test_rationale.py index 70d9b9c01..b52aa3909 100644 --- a/tests/test_rationale.py +++ b/tests/test_rationale.py @@ -1,8 +1,4 @@ -"""Tests for rationale/docstring extraction in extract.py. - -Rationale is now stored as a 'rationale' attribute on the parent node -rather than as separate rationale nodes with rationale_for edges. -""" +"""Tests for rationale/docstring extraction in extract.py.""" import textwrap from pathlib import Path import pytest @@ -16,19 +12,15 @@ def _write_py(tmp_path: Path, code: str) -> Path: return p -def _nodes_with_rationale(result): - return [n for n in result["nodes"] if n.get("rationale")] - - def test_module_docstring_extracted(tmp_path): path = _write_py(tmp_path, ''' """This module handles authentication because legacy sessions were insecure.""" def login(): pass ''') result = extract_python(path) - nodes = _nodes_with_rationale(result) - assert len(nodes) >= 1 - assert any("authentication" in n["rationale"] for n in nodes) + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert len(rationale) >= 1 + assert any("authentication" in n["label"] for n in rationale) def test_function_docstring_extracted(tmp_path): @@ -38,8 +30,8 @@ def process(): pass ''') result = extract_python(path) - nodes = _nodes_with_rationale(result) - assert any("chunked" in n["rationale"] for n in nodes) + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert any("chunked" in n["label"] for n in rationale) def test_class_docstring_extracted(tmp_path): @@ -49,8 +41,8 @@ class Cache: pass ''') result = extract_python(path) - nodes = _nodes_with_rationale(result) - assert any("Redis" in n["rationale"] for n in nodes) + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert any("Redis" in n["label"] for n in rationale) def test_rationale_comment_extracted(tmp_path): @@ -60,12 +52,11 @@ def build(): pass ''') result = extract_python(path) - nodes = _nodes_with_rationale(result) - assert any("NOTE" in n["rationale"] for n in nodes) + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert any("NOTE" in n["label"] for n in rationale) -def test_no_rationale_for_edges(tmp_path): - """Rationale is now an attribute, not a node+edge — no rationale_for edges.""" +def test_rationale_for_edges_present(tmp_path): path = _write_py(tmp_path, ''' """Module docstring explaining the why.""" def foo(): @@ -74,31 +65,29 @@ def foo(): ''') result = extract_python(path) rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"] - assert len(rationale_edges) == 0 + assert len(rationale_edges) >= 1 -def test_no_rationale_nodes(tmp_path): - """No separate rationale nodes are emitted — rationale lives on parent nodes.""" +def test_short_docstring_ignored(tmp_path): + """Trivial docstrings under 20 chars should not become rationale nodes.""" path = _write_py(tmp_path, ''' - """Module docstring explaining the why.""" def foo(): - """Function docstring with rationale.""" + """Constructor.""" pass ''') result = extract_python(path) - rationale_nodes = [n for n in result["nodes"] if n.get("file_type") == "rationale"] - assert len(rationale_nodes) == 0 + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert len(rationale) == 0 -def test_short_docstring_ignored(tmp_path): - """Trivial docstrings under 20 chars should not set the rationale attribute.""" +def test_rationale_confidence_is_extracted(tmp_path): path = _write_py(tmp_path, ''' - def foo(): - """Constructor.""" - pass + """This module exists because we needed a standalone parser.""" + def parse(): pass ''') result = extract_python(path) - assert len(_nodes_with_rationale(result)) == 0 + rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"] + assert all(e.get("confidence") == "EXTRACTED" for e in rationale_edges) def test_alembic_module_docstring_suppressed(tmp_path): @@ -120,8 +109,8 @@ def downgrade(): pass ''') result = extract_python(path) - nodes = _nodes_with_rationale(result) - assert not any("Revision ID" in n["rationale"] for n in nodes) + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert not any("Revision ID" in n["label"] for n in rationale) def test_alembic_function_docstrings_still_extracted(tmp_path): @@ -139,9 +128,11 @@ def downgrade(): pass ''') result = extract_python(path) - nodes = _nodes_with_rationale(result) - assert not any("Revision ID" in n["rationale"] for n in nodes) - assert any("auth" in n["rationale"] for n in nodes) + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + # module docstring suppressed + assert not any("Revision ID" in n["label"] for n in rationale) + # function docstring still captured + assert any("auth" in n["label"] for n in rationale) def test_non_migration_revision_var_not_suppressed(tmp_path): @@ -153,8 +144,8 @@ def test_non_migration_revision_var_not_suppressed(tmp_path): def get_revision(): pass ''') result = extract_python(path) - nodes = _nodes_with_rationale(result) - assert any("audit history" in n["rationale"] for n in nodes) + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert any("audit history" in n["label"] for n in rationale) def test_django_migration_module_docstring_suppressed(tmp_path): @@ -167,8 +158,8 @@ class Migration(migrations.Migration): operations = [] ''') result = extract_python(path) - nodes = _nodes_with_rationale(result) - assert not any("post_priority" in n["rationale"] for n in nodes) + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert not any("post_priority" in n["label"] for n in rationale) def test_generated_file_module_docstring_suppressed(tmp_path): @@ -180,15 +171,16 @@ class UserMessage: pass ''') result = extract_python(path) - nodes = _nodes_with_rationale(result) - assert not any("protocol buffer" in n["rationale"].lower() for n in nodes) + rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"] + assert not any("protocol buffer" in n["label"].lower() for n in rationale) def test_decorated_method_node_id_is_class_qualified(tmp_path): """Regression for #1050: @property / @staticmethod / @classmethod methods - were emitted with a class-unqualified node id (e.g. ``file_baz``). The - rationale walker uses class-qualified ids, so the docstring rationale must - land on the same method node id. + were emitted with a class-unqualified node id (e.g. ``file_baz``) while the + rationale walker emitted the class-qualified id (``file_bar_baz``) as the + docstring's edge target. The mismatch caused ``build_from_json`` to drop + the rationale_for edge as dangling, orphaning the docstring node. """ path = _write_py(tmp_path, ''' class Bar: @@ -222,7 +214,7 @@ def normal(self) -> int: assert normal_id.endswith("_bar_normal"), normal_id # Each decorated method must share the same class-qualified id shape so the - # extracted rationale lands on the actual method node. + # rationale_for edge target matches the method node id. for decorated_name in ("baz", "helper", "factory"): matches = [nid for nid, n in nodes_by_id.items() if n.get("label") == f".{decorated_name}()"] @@ -239,17 +231,33 @@ def normal(self) -> int: f"the class-qualified id" ) - # Rationale is stored directly on method nodes, not as rationale_for edges. + # Every rationale_for edge's target must resolve to an actual node in the + # extraction (no dangling edges into phantom unqualified ids). + node_ids = set(nodes_by_id.keys()) + rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"] + for edge in rationale_edges: + assert edge["target"] in node_ids, ( + f"rationale_for edge targets missing node id {edge['target']!r}" + ) + + # After build_from_json, each decorated-method docstring node must be + # connected (degree > 0), not an orphan dropped from the graph. g = build_from_json(result) for decorated_name in ("baz", "helper", "factory", "normal"): method_id = next( nid for nid, n in nodes_by_id.items() if n.get("label") == f".{decorated_name}()" ) - assert nodes_by_id[method_id].get("rationale"), ( - f"method node for ``.{decorated_name}()`` is missing docstring rationale" - ) - assert method_id in g.nodes, f"method node {method_id} missing from graph" - assert g.nodes[method_id].get("rationale"), ( - f"method node {method_id} lost rationale after build_from_json" + # Find rationale node attached to this method. + attached_rationale = [ + e["source"] for e in rationale_edges if e["target"] == method_id + ] + assert attached_rationale, ( + f"no rationale_for edge found for ``.{decorated_name}()`` method" ) + for r_id in attached_rationale: + assert r_id in g.nodes, f"rationale node {r_id} missing from graph" + assert g.degree(r_id) > 0, ( + f"rationale node {r_id} for ``.{decorated_name}()`` is orphaned " + f"(degree 0) after build_from_json" + )