From 639bb6493e23ebb8d0ed950ccc5dad870533c181 Mon Sep 17 00:00:00 2001
From: Kedar Vartak <kedar.vartak22@vit.edu>
Date: Wed, 27 May 2026 09:42:08 +0530
Subject: [PATCH 1/5] fix: update file node ID generation to use _file_stem for
 consistency and compatibility with cached graphs

---
 graphify/build.py       | 27 ++++++++++++++
 graphify/extract.py     | 78 +++++++++++++++++-----------------------
 tests/test_rationale.py | 79 +++++++++++++++++++++++------------------
 3 files changed, 103 insertions(+), 81 deletions(-)

diff --git a/graphify/build.py b/graphify/build.py
index f180419db..0ace617d6 100644
--- a/graphify/build.py
+++ b/graphify/build.py
@@ -65,6 +65,21 @@ def _normalize_id(s: str) -> str:
     return cleaned.strip("_").casefold()
 
 
+_EXT_SUFFIXES = re.compile(r"_(py|ts|tsx|js|jsx|mjs|rb|go|java|cs|cpp|c|h|rs|kt|scala|php|swift|sql|r|lua|ex|exs|erl|hs|clj|cljs|ml|fs|fsx|vb|groovy|gradle|sh|bash|zsh|ps1|psm1|tf|hcl|yaml|yml|json|toml|xml|html|css|scss|sass|less|vue|svelte|md|rst|txt)$")
+
+
+def _strip_ext_suffix(nid: str) -> str:
+    """Strip a trailing language-extension suffix from a legacy node ID.
+
+    Old builds used _make_id(str(path)) which appended the file extension as an
+    underscore-separated token (e.g. script_pipeline_step_py). New builds use
+    _make_id(_file_stem(path)) which omits the extension. Stripping the suffix
+    here lets existing cached graph.json files merge correctly with new extractions
+    without requiring --force re-extraction.
+    """
+    return _EXT_SUFFIXES.sub("", nid)
+
+
 def _norm_source_file(p: str | None, root: str | None = None) -> str | None:
     """Normalize path separators and relativize absolute paths.
 
@@ -144,6 +159,18 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
         ft = node.get("file_type", "")
         if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}:
             node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept")
+        # Strip legacy extension suffix from file-level node IDs (e.g. script_pipeline_step_py
+        # → script_pipeline_step) so old cached graphs merge with new AST extractions (#952).
+        if "id" in node:
+            node["id"] = _strip_ext_suffix(node["id"])
+
+    for edge in extraction.get("edges", []):
+        if not isinstance(edge, dict):
+            continue
+        if "source" in edge:
+            edge["source"] = _strip_ext_suffix(edge["source"])
+        if "target" in edge:
+            edge["target"] = _strip_ext_suffix(edge["target"])
 
     errors = validate_extraction(extraction)
     # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors.
diff --git a/graphify/extract.py b/graphify/extract.py
index 88372d173..1a46d71d4 100644
--- a/graphify/extract.py
+++ b/graphify/extract.py
@@ -1682,7 +1682,7 @@ def ensure_named_node(name: str, line: int) -> str:
             add_node(nid, name, line)
         return nid
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def walk(node, parent_class_nid: str | None = None) -> None:
@@ -2524,7 +2524,7 @@ def _extract_python_rationale(path: Path, result: dict) -> None:
     nodes = result["nodes"]
     edges = result["edges"]
     seen_ids = {n["id"] for n in nodes}
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
 
     def _get_docstring(body_node) -> tuple[str, int] | None:
         if not body_node:
@@ -2542,25 +2542,11 @@ def _get_docstring(body_node) -> tuple[str, int] | None:
 
     def _add_rationale(text: str, line: int, parent_nid: str) -> None:
         label = text[:80].replace("\r\n", " ").replace("\r", " ").replace("\n", " ").strip()
-        rid = _make_id(stem, "rationale", str(line))
-        if rid not in seen_ids:
-            seen_ids.add(rid)
-            nodes.append({
-                "id": rid,
-                "label": label,
-                "file_type": "rationale",
-                "source_file": str_path,
-                "source_location": f"L{line}",
-            })
-        edges.append({
-            "source": rid,
-            "target": parent_nid,
-            "relation": "rationale_for",
-            "confidence": "EXTRACTED",
-            "source_file": str_path,
-            "source_location": f"L{line}",
-            "weight": 1.0,
-        })
+        for n in nodes:
+            if n["id"] == parent_nid:
+                if "rationale" not in n:
+                    n["rationale"] = label
+                break
 
     # Module-level docstring — skip for auto-generated files (Alembic, Django
     # migrations, protobuf stubs, etc.) whose module docstrings are revision
@@ -2969,7 +2955,7 @@ def _add_edge(src: str, tgt: str, relation: str, line: int,
     plain_method_re = _re.compile(r"""^\s*def\s+(\w+)\s*\(""")
 
     current_class_nid: str | None = None
-    file_nid = _make_id(str_path)
+    file_nid = _make_id(_file_stem(path))
 
     # Ensure the file node exists (tree-sitter pass may have emitted it)
     if file_nid not in seen_ids:
@@ -3064,7 +3050,7 @@ def extract_blade(path: Path) -> dict:
     except OSError:
         return {"error": f"cannot read {path}"}
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     nodes = [{"id": file_nid, "label": path.name, "file_type": "code",
               "source_file": str(path), "source_location": None}]
     edges = []
@@ -3112,7 +3098,7 @@ def extract_dart(path: Path) -> dict:
 
     # Use stem (not str(path)) for child IDs to keep them machine-independent.
     stem = _file_stem(path)
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     nodes = [{"id": file_nid, "label": path.name, "file_type": "code",
               "source_file": str(path), "source_location": None}]
     edges = []
@@ -3194,7 +3180,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
                       "confidence": confidence, "confidence_score": score,
                       "source_file": str_path, "source_location": f"L{line}", "weight": 1.0})
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def walk(node, module_nid: str | None = None) -> None:
@@ -3281,7 +3267,7 @@ def extract_sql(path: Path) -> dict:
 
     stem = _file_stem(path)
     str_path = str(path)
-    file_nid = _make_id(str_path)
+    file_nid = _make_id(_file_stem(path))
     nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code",
                            "source_file": str_path, "source_location": None}]
     edges: list[dict] = []
@@ -3589,7 +3575,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def _func_name_from_signature(sig_node) -> str | None:
@@ -3841,7 +3827,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def _fortran_name(stmt_node) -> str | None:
@@ -4013,7 +3999,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def walk(node) -> None:
@@ -4241,7 +4227,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def walk(node, parent_impl_nid: str | None = None) -> None:
@@ -4420,7 +4406,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def _extract_import(node) -> None:
@@ -4590,7 +4576,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     _PS_SKIP = frozenset({
@@ -6042,7 +6028,7 @@ def _resolve_cross_file_java_imports(
     new_edges: list[dict] = []
     seen_pairs: set[tuple[str, str]] = set()
     for path in paths:
-        file_nid = _make_id(str(path))
+        file_nid = _make_id(_file_stem(path))
         try:
             source = path.read_bytes()
             tree = parser.parse(source)
@@ -6129,7 +6115,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def _read(node) -> str:
@@ -6331,7 +6317,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     _IMPORT_KEYWORDS = frozenset({"alias", "import", "require", "use"})
@@ -6523,7 +6509,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
                       "confidence": confidence, "source_file": str_path,
                       "source_location": f"L{line}", "weight": weight})
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     # Track heading stack for nesting: [(level, nid), ...]
@@ -6861,7 +6847,7 @@ def _add_edge(src: str, tgt: str, relation: str, line: int, context: str | None
     def _lineno(text: str, offset: int) -> int:
         return text.count("\n", 0, offset) + 1
 
-    file_nid = _make_id(str_path)
+    file_nid = _make_id(_file_stem(path))
     _add_node(file_nid, path.name, 1)
 
     stripped = _pascal_strip_comments(raw)
@@ -7036,7 +7022,7 @@ def add_edge(
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
     module_nid = file_nid
 
@@ -7259,7 +7245,7 @@ def add_edge(
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     obj_re = re.compile(r"^\s*object\s+\w+\s*:\s*(\w+)", re.IGNORECASE)
@@ -7359,7 +7345,7 @@ def add_edge(
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     obj_re   = re.compile(r"^\s*object\s+\w+\s*:\s*(\w+)", re.IGNORECASE)
@@ -7471,7 +7457,7 @@ def add_edge(src: str, tgt: str, relation: str, context: str | None = None) -> N
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name)
 
     name_elem = xml_root.find(".//Package/Name")
@@ -7570,7 +7556,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     # file_nid is fully path-derived and never produced by _make_id(stem, func_name),
     # so appending "__entry" guarantees a distinct ID from any function node.
     entry_nid = file_nid + "__entry"
@@ -7756,7 +7742,7 @@ def extract_sln(path: Path) -> dict:
     except OSError:
         return {"nodes": [], "edges": [], "error": f"cannot read {path}"}
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     str_path = str(path)
     nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code",
                           "source_file": str_path, "source_location": None}]
@@ -7843,7 +7829,7 @@ def extract_csproj(path: Path) -> dict:
     except ET.ParseError as e:
         return {"nodes": [], "edges": [], "error": f"XML parse error: {e}"}
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     str_path = str(path)
     nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code",
                           "source_file": str_path, "source_location": None}]
@@ -7944,7 +7930,7 @@ def extract_razor(path: Path) -> dict:
     except OSError:
         return {"nodes": [], "edges": [], "error": f"cannot read {path}"}
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     str_path = str(path)
     nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code",
                           "source_file": str_path, "source_location": None}]
@@ -8103,7 +8089,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def _key_text(pair_node) -> str | None:
diff --git a/tests/test_rationale.py b/tests/test_rationale.py
index 67bd3df97..25590da47 100644
--- a/tests/test_rationale.py
+++ b/tests/test_rationale.py
@@ -1,4 +1,8 @@
-"""Tests for rationale/docstring extraction in extract.py."""
+"""Tests for rationale/docstring extraction in extract.py.
+
+Rationale is now stored as a 'rationale' attribute on the parent node
+rather than as separate rationale nodes with rationale_for edges.
+"""
 import textwrap
 from pathlib import Path
 import pytest
@@ -11,15 +15,19 @@ def _write_py(tmp_path: Path, code: str) -> Path:
     return p
 
 
+def _nodes_with_rationale(result):
+    return [n for n in result["nodes"] if n.get("rationale")]
+
+
 def test_module_docstring_extracted(tmp_path):
     path = _write_py(tmp_path, '''
         """This module handles authentication because legacy sessions were insecure."""
         def login(): pass
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert len(rationale) >= 1
-    assert any("authentication" in n["label"] for n in rationale)
+    nodes = _nodes_with_rationale(result)
+    assert len(nodes) >= 1
+    assert any("authentication" in n["rationale"] for n in nodes)
 
 
 def test_function_docstring_extracted(tmp_path):
@@ -29,8 +37,8 @@ def process():
             pass
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert any("chunked" in n["label"] for n in rationale)
+    nodes = _nodes_with_rationale(result)
+    assert any("chunked" in n["rationale"] for n in nodes)
 
 
 def test_class_docstring_extracted(tmp_path):
@@ -40,8 +48,8 @@ class Cache:
             pass
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert any("Redis" in n["label"] for n in rationale)
+    nodes = _nodes_with_rationale(result)
+    assert any("Redis" in n["rationale"] for n in nodes)
 
 
 def test_rationale_comment_extracted(tmp_path):
@@ -51,11 +59,12 @@ def build():
             pass
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert any("NOTE" in n["label"] for n in rationale)
+    nodes = _nodes_with_rationale(result)
+    assert any("NOTE" in n["rationale"] for n in nodes)
 
 
-def test_rationale_for_edges_present(tmp_path):
+def test_no_rationale_for_edges(tmp_path):
+    """Rationale is now an attribute, not a node+edge — no rationale_for edges."""
     path = _write_py(tmp_path, '''
         """Module docstring explaining the why."""
         def foo():
@@ -64,29 +73,31 @@ def foo():
     ''')
     result = extract_python(path)
     rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"]
-    assert len(rationale_edges) >= 1
+    assert len(rationale_edges) == 0
 
 
-def test_short_docstring_ignored(tmp_path):
-    """Trivial docstrings under 20 chars should not become rationale nodes."""
+def test_no_rationale_nodes(tmp_path):
+    """No separate rationale nodes are emitted — rationale lives on parent nodes."""
     path = _write_py(tmp_path, '''
+        """Module docstring explaining the why."""
         def foo():
-            """Constructor."""
+            """Function docstring with rationale."""
             pass
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert len(rationale) == 0
+    rationale_nodes = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert len(rationale_nodes) == 0
 
 
-def test_rationale_confidence_is_extracted(tmp_path):
+def test_short_docstring_ignored(tmp_path):
+    """Trivial docstrings under 20 chars should not set the rationale attribute."""
     path = _write_py(tmp_path, '''
-        """This module exists because we needed a standalone parser."""
-        def parse(): pass
+        def foo():
+            """Constructor."""
+            pass
     ''')
     result = extract_python(path)
-    rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"]
-    assert all(e.get("confidence") == "EXTRACTED" for e in rationale_edges)
+    assert len(_nodes_with_rationale(result)) == 0
 
 
 def test_alembic_module_docstring_suppressed(tmp_path):
@@ -108,8 +119,8 @@ def downgrade():
             pass
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert not any("Revision ID" in n["label"] for n in rationale)
+    nodes = _nodes_with_rationale(result)
+    assert not any("Revision ID" in n["rationale"] for n in nodes)
 
 
 def test_alembic_function_docstrings_still_extracted(tmp_path):
@@ -127,11 +138,9 @@ def downgrade():
             pass
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    # module docstring suppressed
-    assert not any("Revision ID" in n["label"] for n in rationale)
-    # function docstring still captured
-    assert any("auth" in n["label"] for n in rationale)
+    nodes = _nodes_with_rationale(result)
+    assert not any("Revision ID" in n["rationale"] for n in nodes)
+    assert any("auth" in n["rationale"] for n in nodes)
 
 
 def test_non_migration_revision_var_not_suppressed(tmp_path):
@@ -143,8 +152,8 @@ def test_non_migration_revision_var_not_suppressed(tmp_path):
         def get_revision(): pass
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert any("audit history" in n["label"] for n in rationale)
+    nodes = _nodes_with_rationale(result)
+    assert any("audit history" in n["rationale"] for n in nodes)
 
 
 def test_django_migration_module_docstring_suppressed(tmp_path):
@@ -157,8 +166,8 @@ class Migration(migrations.Migration):
             operations = []
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert not any("post_priority" in n["label"] for n in rationale)
+    nodes = _nodes_with_rationale(result)
+    assert not any("post_priority" in n["rationale"] for n in nodes)
 
 
 def test_generated_file_module_docstring_suppressed(tmp_path):
@@ -170,5 +179,5 @@ class UserMessage:
             pass
     ''')
     result = extract_python(path)
-    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert not any("protocol buffer" in n["label"].lower() for n in rationale)
+    nodes = _nodes_with_rationale(result)
+    assert not any("protocol buffer" in n["rationale"].lower() for n in nodes)

From 34f97da8164daf708945a0ad4b008151021987f4 Mon Sep 17 00:00:00 2001
From: Kedar Vartak <kedar.vartak22@vit.edu>
Date: Wed, 27 May 2026 16:20:33 +0530
Subject: [PATCH 2/5] fix: normalize file node IDs by using _file_stem to
 handle compound extensions and avoid ID collisions

---
 graphify/extract.py | 64 +++++++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/graphify/extract.py b/graphify/extract.py
index 1a46d71d4..3b81b9c1c 100644
--- a/graphify/extract.py
+++ b/graphify/extract.py
@@ -47,13 +47,29 @@ def _make_id(*parts: str) -> str:
     return cleaned.strip("_").casefold()
 
 
+# Inner extensions that appear in compound filenames (e.g. Button.svelte.ts).
+# Stripping them here ensures is-mobile.svelte.ts and is-mobile.svelte share
+# the same base ID, while leaving symbol names like format_c or test_py untouched.
+_COMPOUND_INNER_EXTS = frozenset({".svelte", ".vue", ".astro"})
+
+
 def _file_stem(path: Path) -> str:
-    """Return a stem qualified with the parent directory name to avoid ID collisions
-    when multiple files share the same filename in different directories (#550)."""
+    """Return a normalised stem qualified with the parent directory name.
+
+    Strips compound inner extensions (e.g. .svelte from Button.svelte.ts) so
+    that the runes/TypeScript variant of a component produces the same base ID
+    as the component file itself.  Avoids ID collisions for same-named files in
+    different directories (#550).
+    """
+    stem = path.stem
+    # Strip compound inner extension: "is-mobile.svelte" → "is-mobile"
+    inner = Path(stem).suffix
+    if inner in _COMPOUND_INNER_EXTS:
+        stem = Path(stem).stem
     parent = path.parent.name
     if parent and parent not in (".", ""):
-        return f"{parent}.{path.stem}"
-    return path.stem
+        return f"{parent}.{stem}"
+    return stem
 
 
 _TSCONFIG_ALIAS_CACHE: dict[str, dict[str, str]] = {}
@@ -736,7 +752,7 @@ def _resolve_js_import_target(raw: str, str_path: str) -> "tuple[str, Path | Non
         return None
     resolved_path = _resolve_js_module_path(raw, Path(str_path).parent)
     if resolved_path is not None:
-        return _make_id(str(resolved_path)), resolved_path
+        return _make_id(_file_stem(resolved_path)), resolved_path
     module_name = raw.split("/")[-1]
     if not module_name:
         return None
@@ -2627,23 +2643,23 @@ def extract_svelte(path: Path) -> dict:
         import re as _re
         src = path.read_text(encoding="utf-8", errors="replace")
         existing_ids = {n["id"] for n in result.get("nodes", [])}
-        # Source file node ID must match the one _extract_generic creates:
-        # _make_id(str(path)) - single arg, no stem prefix. Otherwise the source
-        # endpoint is a phantom node and build_from_json drops the edge (#701).
-        file_node_id = _make_id(str(path))
+        # Source file node ID must match the one _extract_generic creates via
+        # _make_id(_file_stem(path)).  Using _file_stem here (not str(path))
+        # keeps the source endpoint consistent and avoids phantom nodes (#701).
+        file_node_id = _make_id(_file_stem(path))
         aliases = _load_tsconfig_aliases(path.parent)
         for m in _re.finditer(r"""import\(\s*['"]([^'"]+)['"]\s*\)""", src):
             raw = m.group(1)
             if not raw:
                 continue
             if raw.startswith("."):
-                # Relative import - resolve to full path so IDs match file node IDs.
+                # Relative import - resolve via _file_stem so IDs match file node IDs.
                 resolved = Path(os.path.normpath(path.parent / raw))
                 # Apply same TS/Svelte resolver fixups as static imports so dynamic
                 # imports of bare paths and .svelte.ts rune files land on real
                 # file nodes instead of phantom ids (#716).
                 resolved = _resolve_js_module_path(resolved)
-                node_id = _make_id(str(resolved))
+                node_id = _make_id(_file_stem(resolved))
                 stub_source_file = str(resolved)
             else:
                 # Check tsconfig.json path aliases (e.g. "$lib/" -> "src/lib/", "@/" -> "src/")
@@ -2657,7 +2673,7 @@ def extract_svelte(path: Path) -> dict:
                         break
                 if resolved_alias is not None:
                     resolved_alias = _resolve_js_module_path(resolved_alias)
-                    node_id = _make_id(str(resolved_alias))
+                    node_id = _make_id(_file_stem(resolved_alias))
                     stub_source_file = str(resolved_alias)
                 else:
                     # Bare/scoped import (node_modules) - use last segment;
@@ -2709,7 +2725,7 @@ def extract_svelte(path: Path) -> dict:
                         resolved = resolved.with_suffix(".ts")
                     elif resolved.suffix == ".jsx":
                         resolved = resolved.with_suffix(".tsx")
-                    node_id = _make_id(str(resolved))
+                    node_id = _make_id(_file_stem(resolved))
                     stub_source_file = str(resolved)
                 else:
                     resolved_alias = None
@@ -2719,7 +2735,7 @@ def extract_svelte(path: Path) -> dict:
                             resolved_alias = Path(os.path.normpath(Path(alias_base) / rest))
                             break
                     if resolved_alias is not None:
-                        node_id = _make_id(str(resolved_alias))
+                        node_id = _make_id(_file_stem(resolved_alias))
                         stub_source_file = str(resolved_alias)
                     else:
                         module_name = raw.split("/")[-1]
@@ -2768,7 +2784,7 @@ def extract_astro(path: Path) -> dict:
         import re as _re
         src = path.read_text(encoding="utf-8", errors="replace")
         existing_ids = {n["id"] for n in result.get("nodes", [])}
-        file_node_id = _make_id(str(path))
+        file_node_id = _make_id(_file_stem(path))
         aliases = _load_tsconfig_aliases(path.parent)
         # Dynamic imports anywhere in the file: `import('./X.astro')` is legal in
         # frontmatter setup code and inside expression slots.
@@ -2779,7 +2795,7 @@ def extract_astro(path: Path) -> dict:
             if raw.startswith("."):
                 resolved = Path(os.path.normpath(path.parent / raw))
                 resolved = _resolve_js_module_path(resolved)
-                node_id = _make_id(str(resolved))
+                node_id = _make_id(_file_stem(resolved))
                 stub_source_file = str(resolved)
             else:
                 resolved_alias = None
@@ -2790,7 +2806,7 @@ def extract_astro(path: Path) -> dict:
                         break
                 if resolved_alias is not None:
                     resolved_alias = _resolve_js_module_path(resolved_alias)
-                    node_id = _make_id(str(resolved_alias))
+                    node_id = _make_id(_file_stem(resolved_alias))
                     stub_source_file = str(resolved_alias)
                 else:
                     module_name = raw.split("/")[-1]
@@ -2845,7 +2861,7 @@ def extract_astro(path: Path) -> dict:
                         resolved = resolved.with_suffix(".ts")
                     elif resolved.suffix == ".jsx":
                         resolved = resolved.with_suffix(".tsx")
-                    node_id = _make_id(str(resolved))
+                    node_id = _make_id(_file_stem(resolved))
                     stub_source_file = str(resolved)
                 else:
                     resolved_alias = None
@@ -2855,7 +2871,7 @@ def extract_astro(path: Path) -> dict:
                             resolved_alias = Path(os.path.normpath(Path(alias_base) / rest))
                             break
                     if resolved_alias is not None:
-                        node_id = _make_id(str(resolved_alias))
+                        node_id = _make_id(_file_stem(resolved_alias))
                         stub_source_file = str(resolved_alias)
                     else:
                         module_name = raw.split("/")[-1]
@@ -4929,7 +4945,7 @@ def _apply_symbol_resolution_facts(
         return
 
     path_by_resolved = {path.resolve(): path for path in paths}
-    source_file_id = {path.resolve(): _make_id(str(path)) for path in paths}
+    source_file_id = {path.resolve(): _make_id(_file_stem(path)) for path in paths}
     symbol_nodes: dict[tuple[Path, str], str] = {}
     for node in nodes:
         source_path = _js_source_path(str(node.get("source_file", "")), root)
@@ -8518,12 +8534,14 @@ def extract(
     _augment_symbol_resolution_edges(paths, all_nodes, all_edges, root)
 
     # Remap file node IDs from absolute-path-derived to project-relative so
-    # graph.json edge endpoints are stable across machines (#502)
+    # graph.json edge endpoints are stable across machines (#502).
+    # Both sides go through _file_stem so compound-extension nodes (e.g.
+    # .svelte.ts) map to the same normalised ID as their component counterpart.
     id_remap: dict[str, str] = {}
     for path in paths:
-        old_id = _make_id(str(path))
+        old_id = _make_id(_file_stem(path))
         try:
-            new_id = _make_id(str(path.relative_to(root)))
+            new_id = _make_id(_file_stem(path.relative_to(root)))
         except ValueError:
             continue
         if old_id != new_id:

From e3cc683d89ef65f00da48ff9836472ebcdef3265 Mon Sep 17 00:00:00 2001
From: Kedar Vartak <kedar.vartak22@vit.edu>
Date: Thu, 28 May 2026 22:54:33 +0530
Subject: [PATCH 3/5] feat: implement _file_stem function for consistent file
 ID generation and update related extraction logic

---
 graphify/build.py                         | 45 ++++++++++++-------
 graphify/extract.py                       | 53 +++++++++++------------
 graphify/symbol_resolution.py             | 17 +++++---
 tests/test_astro_extraction.py            | 13 +++---
 tests/test_build.py                       | 17 ++++++++
 tests/test_import_extension_resolution.py | 31 ++++++-------
 6 files changed, 105 insertions(+), 71 deletions(-)

diff --git a/graphify/build.py b/graphify/build.py
index 7dab28369..13880390d 100644
--- a/graphify/build.py
+++ b/graphify/build.py
@@ -51,6 +51,16 @@
 }
 
 
+_LANG_EXT_SUFFIX_RE = re.compile(
+    r"_(?:py|pyi|pyw|js|jsx|mjs|cjs|ts|tsx|vue|svelte|astro|go|rs|java|kt|kts|scala|groovy|c|h|cc|cpp|cxx|hh|hpp|hxx|rb|php|cs|swift|lua|r|sh|bash|zsh|fish|ps1|sql|html|css|scss|sass|less|md|mdx|json|yaml|yml|toml|xml|proto|graphql|gql|dart|ex|exs|erl|hrl|clj|cljs|fs|fsx|vb|pl|pm|pas|pp|dpr|dpk|inc|sol|move|sv|svh|v)$"
+)
+
+
+def _strip_ext_suffix(s: str) -> str:
+    """Drop legacy extension suffixes from path-derived IDs (#1033)."""
+    return _LANG_EXT_SUFFIX_RE.sub("", s)
+
+
 def _normalize_id(s: str) -> str:
     r"""Normalize an ID string the same way extract._make_id does.
 
@@ -65,19 +75,8 @@ def _normalize_id(s: str) -> str:
     return cleaned.strip("_").casefold()
 
 
-_EXT_SUFFIXES = re.compile(r"_(py|ts|tsx|js|jsx|mjs|rb|go|java|cs|cpp|c|h|rs|kt|scala|php|swift|sql|r|lua|ex|exs|erl|hs|clj|cljs|ml|fs|fsx|vb|groovy|gradle|sh|bash|zsh|ps1|psm1|tf|hcl|yaml|yml|json|toml|xml|html|css|scss|sass|less|vue|svelte|md|rst|txt)$")
-
-
-def _strip_ext_suffix(nid: str) -> str:
-    """Strip a trailing language-extension suffix from a legacy node ID.
-
-    Old builds used _make_id(str(path)) which appended the file extension as an
-    underscore-separated token (e.g. script_pipeline_step_py). New builds use
-    _make_id(_file_stem(path)) which omits the extension. Stripping the suffix
-    here lets existing cached graph.json files merge correctly with new extractions
-    without requiring --force re-extraction.
-    """
-    return _EXT_SUFFIXES.sub("", nid)
+def _canonical_id(s: str) -> str:
+    return _strip_ext_suffix(_normalize_id(str(s)))
 
 
 def _norm_source_file(p: str | None, root: str | None = None) -> str | None:
@@ -136,6 +135,8 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
     for node in extraction.get("nodes", []):
         if not isinstance(node, dict):
             continue
+        if node.get("id") not in (None, ""):
+            node["id"] = _canonical_id(str(node["id"]))
         if "source" in node and "source_file" not in node:
             # Count edges that reference this node so the warning is actionable (#479)
             node_id = node.get("id", "?")
@@ -172,6 +173,18 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
         if "target" in edge:
             edge["target"] = _strip_ext_suffix(edge["target"])
 
+    for edge in extraction.get("edges", []):
+        if not isinstance(edge, dict):
+            continue
+        if "source" not in edge and "from" in edge:
+            edge["source"] = edge["from"]
+        if "target" not in edge and "to" in edge:
+            edge["target"] = edge["to"]
+        if edge.get("source") not in (None, ""):
+            edge["source"] = _canonical_id(str(edge["source"]))
+        if edge.get("target") not in (None, ""):
+            edge["target"] = _canonical_id(str(edge["target"]))
+
     errors = validate_extraction(extraction)
     # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors.
     real_errors = [e for e in errors if "does not match any node id" not in e]
@@ -186,7 +199,7 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
     # Normalized ID map: lets edges survive when the LLM generates IDs with
     # slightly different casing or punctuation than the AST extractor.
     # e.g. "Session_ValidateToken" maps to "session_validatetoken".
-    norm_to_id: dict[str, str] = {_normalize_id(nid): nid for nid in node_set}
+    norm_to_id: dict[str, str] = {_canonical_id(nid): nid for nid in node_set}
     # Iterate edges in a deterministic order. The graph is undirected and stores
     # direction in _src/_tgt; when two edges collapse onto the same node pair the
     # last write wins, so an unstable iteration order flips _src/_tgt run-to-run
@@ -208,9 +221,9 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
         src, tgt = edge["source"], edge["target"]
         # Remap mismatched IDs via normalization before dropping the edge.
         if src not in node_set:
-            src = norm_to_id.get(_normalize_id(src), src)
+            src = norm_to_id.get(_canonical_id(src), src)
         if tgt not in node_set:
-            tgt = norm_to_id.get(_normalize_id(tgt), tgt)
+            tgt = norm_to_id.get(_canonical_id(tgt), tgt)
         if src not in node_set or tgt not in node_set:
             continue  # skip edges to external/stdlib nodes - expected, not an error
         attrs = {k: v for k, v in edge.items() if k not in ("source", "target")}
diff --git a/graphify/extract.py b/graphify/extract.py
index 563be9502..c379c933c 100644
--- a/graphify/extract.py
+++ b/graphify/extract.py
@@ -752,7 +752,7 @@ def _import_python(node, source: bytes, file_nid: str, stem: str, edges: list, s
                 for _ in range(dots - 1):
                     base = base.parent
                 rel = (module_name.replace(".", "/") + ".py") if module_name else "__init__.py"
-                tgt_nid = _make_id(str(base / rel))
+                tgt_nid = _make_id(_file_stem(base / rel))
             else:
                 tgt_nid = _make_id(raw)
             edges.append({
@@ -996,7 +996,7 @@ def _import_c(node, source: bytes, file_nid: str, stem: str, edges: list, str_pa
             if child.type != "system_lib_string":
                 resolved = _resolve_c_include_path(raw, str_path)
                 if resolved is not None:
-                    tgt_nid = _make_id(str(resolved))
+                    tgt_nid = _make_id(_file_stem(resolved))
                     edges.append({
                         "source": file_nid,
                         "target": tgt_nid,
@@ -2684,8 +2684,7 @@ def extract_svelte(path: Path) -> dict:
         src = path.read_text(encoding="utf-8", errors="replace")
         existing_ids = {n["id"] for n in result.get("nodes", [])}
         # Source file node ID must match the one _extract_generic creates via
-        # _make_id(_file_stem(path)).  Using _file_stem here (not str(path))
-        # keeps the source endpoint consistent and avoids phantom nodes (#701).
+        # _make_id(_file_stem(path)); otherwise this edge points at a phantom node (#701).
         file_node_id = _make_id(_file_stem(path))
         aliases = _load_tsconfig_aliases(path.parent)
         for m in _re.finditer(r"""import\(\s*['"]([^'"]+)['"]\s*\)""", src):
@@ -5076,7 +5075,7 @@ def add_edge(source: str, target: str, relation: str, context: str, line: int, s
         if source_id is not None:
             add_edge(
                 source_id,
-                _make_id(str(path_by_resolved.get(target_path, target_path))),
+                _make_id(_file_stem(path_by_resolved.get(target_path, target_path))),
                 "re_exports",
                 "export",
                 star_fact.line,
@@ -5100,7 +5099,7 @@ def add_edge(source: str, target: str, relation: str, context: str, line: int, s
             if source_id is not None:
                 add_edge(
                     source_id,
-                    _make_id(str(path_by_resolved.get(origin[0], origin[0]))),
+                    _make_id(_file_stem(path_by_resolved.get(origin[0], origin[0]))),
                     "re_exports",
                     "export",
                     export_fact.line,
@@ -6675,7 +6674,7 @@ def _pascal_resolve_unit(from_path: Path, unit_name: str) -> str:
     """Resolve a Pascal unit name to the graphify node ID of its source file.
 
     Scans all Pascal files under the project root (the highest ancestor that
-    directly contains .pas/.dpr files) and returns _make_id(str(matched_path)).
+    directly contains .pas/.dpr files) and returns _make_id(_file_stem(matched_path)).
     Result is cached per project root so the rglob runs at most once per
     project.  Falls back to _make_id(unit_name) for units not found on disk
     (e.g. standard RTL units like SysUtils, Windows).
@@ -6686,7 +6685,7 @@ def _pascal_resolve_unit(from_path: Path, unit_name: str) -> str:
         unit_map: dict[str, str] = {}
         for ext in (".pas", ".pp", ".dpr", ".dpk", ".inc"):
             for f in root.rglob("*" + ext):
-                unit_map[f.stem.lower()] = _make_id(str(f))
+                unit_map[f.stem.lower()] = _make_id(_file_stem(f))
         _pascal_unit_cache[root_key] = unit_map
     return _pascal_unit_cache[root_key].get(unit_name.lower(), _make_id(unit_name))
 
@@ -7613,8 +7612,8 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
         edges.append(edge)
 
     file_nid = _make_id(_file_stem(path))
-    # file_nid is fully path-derived and never produced by _make_id(stem, func_name),
-    # so appending "__entry" guarantees a distinct ID from any function node.
+    # file_nid is the file-level parent/stem ID, so appending "__entry"
+    # guarantees a distinct ID from any function node.
     entry_nid = file_nid + "__entry"
     add_node(file_nid, path.name, 1, kind="file")
     add_node(entry_nid, f"{path.name} script", 1, kind="bash_entrypoint")
@@ -7735,7 +7734,7 @@ def walk(node, parent_nid: str) -> None:
                             # like `source ../../etc/passwd` that traverse outside
                             # the project tree (B-1).
                             if resolved.exists():
-                                tgt_nid = _make_id(str(resolved))
+                                tgt_nid = _make_id(_file_stem(resolved))
                                 add_edge(file_nid, tgt_nid, "imports_from", line,
                                          context="import")
                         else:
@@ -8281,7 +8280,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int,
             edge["context"] = context
         edges.append(edge)
 
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     add_node(file_nid, path.name, 1)
 
     def _type_path_text(node) -> str:
@@ -8322,7 +8321,7 @@ def walk(node, parent_type_path: "str | None" = None,
                 resolved = (path.parent / norm).resolve()
                 edge: dict = {
                     "source": file_nid,
-                    "target": _make_id(str(resolved)) if resolved.exists() else _make_id(norm),
+                    "target": _make_id(_file_stem(resolved)) if resolved.exists() else _make_id(norm),
                     "relation": "imports_from" if resolved.exists() else "imports",
                     "context": "import",
                     "confidence": "EXTRACTED",
@@ -8520,7 +8519,7 @@ def extract_dmi(path: Path) -> dict:
 
     str_path = str(path)
     stem = _file_stem(path)
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code",
                            "source_file": str_path, "source_location": "L1"}]
     edges: list[dict] = []
@@ -8619,7 +8618,7 @@ def extract_dmm(path: Path) -> dict:
         return {"nodes": [], "edges": [], "error": str(e)}
 
     str_path = str(path)
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code",
                            "source_file": str_path, "source_location": "L1"}]
     edges: list[dict] = []
@@ -8692,7 +8691,7 @@ def extract_dmf(path: Path) -> dict:
 
     str_path = str(path)
     stem = _file_stem(path)
-    file_nid = _make_id(str(path))
+    file_nid = _make_id(_file_stem(path))
     nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code",
                            "source_file": str_path, "source_location": "L1"}]
     edges: list[dict] = []
@@ -9087,19 +9086,19 @@ def extract(
 
     _augment_symbol_resolution_edges(paths, all_nodes, all_edges, root)
 
-    # Remap file node IDs from absolute-path-derived to project-relative so
-    # graph.json edge endpoints are stable across machines (#502).
-    # Both sides go through _file_stem so compound-extension nodes (e.g.
-    # .svelte.ts) map to the same normalised ID as their component counterpart.
+    # Remap legacy full-path-derived file node IDs to parent/stem IDs so cached
+    # per-file extractions converge with fresh output (#1033).
     id_remap: dict[str, str] = {}
     for path in paths:
-        old_id = _make_id(_file_stem(path))
+        new_id = _make_id(_file_stem(path))
+        legacy_ids = {_make_id(os.fspath(path))}
         try:
-            new_id = _make_id(_file_stem(path.relative_to(root)))
+            legacy_ids.add(_make_id(os.fspath(path.relative_to(root))))
         except ValueError:
-            continue
-        if old_id != new_id:
-            id_remap[old_id] = new_id
+            pass
+        for old_id in legacy_ids:
+            if old_id != new_id:
+                id_remap[old_id] = new_id
     if id_remap:
         for n in all_nodes:
             if n.get("id") in id_remap:
@@ -9170,7 +9169,7 @@ def extract(
 
     # Map each node back to its containing file_id so we can ask
     # "did the caller's file import the callee's file?"
-    # Use relativized paths to match how file node IDs were remapped above (#502).
+    # Use parent/stem file IDs to match file nodes after normalization (#1033).
     nid_to_file_nid: dict[str, str] = {}
     for n in all_nodes:
         sf = n.get("source_file")
@@ -9181,7 +9180,7 @@ def extract(
             sf_rel = sf_path.relative_to(root) if sf_path.is_absolute() else sf_path
         except ValueError:
             sf_rel = sf_path
-        nid_to_file_nid[n["id"]] = _make_id(str(sf_rel))
+        nid_to_file_nid[n["id"]] = _make_id(_file_stem(sf_rel))
 
     existing_pairs = {(e["source"], e["target"]) for e in all_edges}
     for rc in all_raw_calls:
diff --git a/graphify/symbol_resolution.py b/graphify/symbol_resolution.py
index 7bc68093a..1f044f95a 100644
--- a/graphify/symbol_resolution.py
+++ b/graphify/symbol_resolution.py
@@ -365,14 +365,17 @@ def _bash_make_id(*parts: str) -> str:
     return cleaned.strip("_").casefold()
 
 
+def _file_stem(path: Path) -> str:
+    parent = path.parent.name
+    if parent and parent not in (".", ""):
+        return f"{parent}.{path.stem}"
+    return path.stem
+
+
 def _file_node_id_for_path(path: Path, root: Path) -> str:
-    # Resolve both sides so callers that pass relative or non-canonical roots
-    # get the same canonical relative path that extract()'s id_remap produces.
-    # _bash_make_id is an exact copy of extract._make_id, so IDs match.
-    try:
-        return _bash_make_id(str(path.resolve().relative_to(root.resolve())))
-    except ValueError:
-        return _bash_make_id(str(path))  # path outside root: hash absolute path as fallback
+    # Keep Bash source-edge file IDs in sync with extract._file_stem-based
+    # file node IDs (#1033).
+    return _bash_make_id(_file_stem(path))
 
 
 def resolve_bash_source_edges(
diff --git a/tests/test_astro_extraction.py b/tests/test_astro_extraction.py
index c21e66c24..9a50c0f4c 100644
--- a/tests/test_astro_extraction.py
+++ b/tests/test_astro_extraction.py
@@ -13,6 +13,7 @@
 
 from graphify.detect import CODE_EXTENSIONS
 from graphify.extract import (
+    _file_stem,
     _make_id,
     extract_astro,
 )
@@ -57,8 +58,8 @@ def test_extract_astro_picks_up_frontmatter_static_imports(tmp_path):
 
     result = extract_astro(page)
     targets = _import_targets(result, relation="imports_from")
-    assert _make_id(str(layout)) in targets
-    assert _make_id(str(hero)) in targets
+    assert _make_id(_file_stem(layout)) in targets
+    assert _make_id(_file_stem(hero)) in targets
 
 
 def test_extract_astro_handles_dynamic_import_in_frontmatter(tmp_path):
@@ -75,7 +76,7 @@ def test_extract_astro_handles_dynamic_import_in_frontmatter(tmp_path):
 
     result = extract_astro(page)
     targets = _import_targets(result, relation="dynamic_import")
-    assert _make_id(str(other)) in targets
+    assert _make_id(_file_stem(other)) in targets
 
 
 def test_extract_astro_picks_up_client_side_script_imports(tmp_path):
@@ -100,8 +101,8 @@ def test_extract_astro_picks_up_client_side_script_imports(tmp_path):
 
     result = extract_astro(page)
     targets = _import_targets(result, relation="imports_from")
-    assert _make_id(str(layout)) in targets
-    assert _make_id(str(hydrate)) in targets
+    assert _make_id(_file_stem(layout)) in targets
+    assert _make_id(_file_stem(hydrate)) in targets
 
 
 def test_extract_astro_no_frontmatter_does_not_crash(tmp_path):
@@ -140,4 +141,4 @@ def test_extract_astro_handles_tsconfig_path_alias(tmp_path):
 
     result = extract_astro(page)
     targets = _import_targets(result, relation="imports_from")
-    assert _make_id(str(hero)) in targets
+    assert _make_id(_file_stem(hero)) in targets
diff --git a/tests/test_build.py b/tests/test_build.py
index 065d7f414..419cb63d1 100644
--- a/tests/test_build.py
+++ b/tests/test_build.py
@@ -346,6 +346,23 @@ def test_build_from_json_relative_source_file_unchanged(tmp_path):
     assert G.nodes["foo_bar"]["source_file"] == "src/foo.py"
 
 
+def test_build_from_json_strips_legacy_language_extension_suffixes():
+    extraction = {
+        "nodes": [
+            {"id": "script_pipeline_step_py", "label": "pipeline_step.py", "file_type": "code"},
+            {"id": "consumer", "label": "consumer", "file_type": "code"},
+        ],
+        "edges": [
+            {"source": "consumer", "target": "script_pipeline_step_py",
+             "relation": "imports_from", "confidence": "EXTRACTED"},
+        ],
+    }
+    G = build_from_json(extraction)
+    assert "script_pipeline_step" in G
+    assert "script_pipeline_step_py" not in G
+    assert G.has_edge("consumer", "script_pipeline_step")
+
+
 def test_build_merge_prune_absolute_paths_match_relative_nodes(tmp_path):
     """#1007: manifest stores absolute paths, graph nodes store relative paths.
     prune_sources with absolute paths must still remove the right nodes and edges."""
diff --git a/tests/test_import_extension_resolution.py b/tests/test_import_extension_resolution.py
index 0d1222c0a..7312a14aa 100644
--- a/tests/test_import_extension_resolution.py
+++ b/tests/test_import_extension_resolution.py
@@ -1,16 +1,17 @@
 """Tests for #716 — TypeScript bare-path imports, Svelte 5 rune file imports
 (`from './foo.svelte'` for a `.svelte.ts` file), and directory/index.ts
-imports must resolve to the actual file's node id, not a phantom.
+imports must resolve to the actual file's parent/stem node id, not a phantom.
 
 Before #716, `_import_js` only rewrote `.js → .ts` and `.jsx → .tsx`. Every
 other shape (bare path, `.svelte → .svelte.ts`, `./foo` directory imports)
-produced an id like `..._foo` while the real file's node id was `..._foo_ts`,
+produced an id like `..._foo` while the real file's node id is `parent_foo`,
 so `build_from_json` dropped the edge as external.
 """
 
 from pathlib import Path
 
 from graphify.extract import (
+    _file_stem,
     _make_id,
     _resolve_js_module_path,
     extract_js,
@@ -185,7 +186,7 @@ def test_bare_path_import_resolves_in_ts_file(tmp_path):
     importer = _write(tmp_path / "page.ts",
                       "import type { GetNestedType } from './type-helpers'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f"Bare-path .ts import must resolve to target node id; "
         f"expected {expected}; got {_import_targets(result)}"
@@ -199,7 +200,7 @@ def test_directory_import_resolves_to_index_ts(tmp_path):
     importer = _write(tmp_path / "page.ts",
                       "import { enqueue } from './queue'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f"Directory import must resolve to ./queue/index.ts; "
         f"expected {expected}; got {_import_targets(result)}"
@@ -216,7 +217,7 @@ def test_dot_svelte_import_resolves_to_dot_svelte_ts(tmp_path):
     importer = _write(tmp_path / "page.ts",
                       "import { isMobile } from './is-mobile.svelte'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f".svelte → .svelte.ts resolution failed; "
         f"expected {expected}; got {_import_targets(result)}"
@@ -233,7 +234,7 @@ def test_explicit_ts_import_still_works(tmp_path):
     importer = _write(tmp_path / "page.ts",
                       "import { x } from './foo.ts'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f"Explicit .ts imports must still resolve; "
         f"expected {expected}; got {_import_targets(result)}"
@@ -247,7 +248,7 @@ def test_explicit_svelte_import_still_works(tmp_path):
     importer = _write(tmp_path / "page.ts",
                       "import Card from './Card.svelte'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f"Existing .svelte imports must resolve to the .svelte node, "
         f"not get redirected; expected {expected}; "
@@ -285,7 +286,7 @@ def test_alias_import_with_bare_path_resolves(tmp_path):
     importer = _write(importer_dir / "page.ts",
                       "import type { X } from '$lib/type-helpers'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f"Alias + bare-path resolution failed; "
         f"expected {expected}; got {_import_targets(result)}"
@@ -304,7 +305,7 @@ def test_type_only_import_with_bare_path_resolves(tmp_path):
     importer = _write(tmp_path / "page.ts",
                       "import type { GetNestedType } from './type-helpers'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f"Type-only import with bare path failed to resolve; "
         f"expected {expected}; got {_import_targets(result)}"
@@ -342,7 +343,7 @@ def test_alias_directory_import_resolves_to_index_ts(tmp_path):
     importer = _write(src / "routes" / "page.ts",
                       "import { enqueue } from '$lib/queue'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f"Alias + directory resolution failed; "
         f"expected {expected}; got {_import_targets(result)}"
@@ -428,7 +429,7 @@ def test_end_to_end_multi_dot_import_resolves(tmp_path):
     importer = _write(tmp_path / "page.ts",
                       "import { apply } from './tag-action.shared'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f"Multi-dot import failed end-to-end; "
         f"expected {expected}; got {_import_targets(result)}"
@@ -448,7 +449,7 @@ def test_resolve_chain_alias_and_extension_compose(tmp_path):
     importer = _write(src / "routes" / "page.ts",
                       "import { isMobile } from '$lib/hooks/is-mobile.svelte'\n")
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in _import_targets(result), (
         f"Alias + .svelte→.svelte.ts chain failed to compose; "
         f"expected {expected}; got {_import_targets(result)}"
@@ -473,7 +474,7 @@ def test_ts_dynamic_import_bare_path_resolves(tmp_path):
 }
 """)
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     targets = {str(e.get("target") or "") for e in result["edges"]
                if e.get("relation") in ("imports", "imports_from")}
     assert expected in targets, (
@@ -498,7 +499,7 @@ def test_ts_dynamic_import_alias_with_bare_path_resolves(tmp_path):
 }
 """)
     result = extract_js(importer)
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     targets = {str(e.get("target") or "") for e in result["edges"]
                if e.get("relation") in ("imports", "imports_from")}
     assert expected in targets, (
@@ -521,7 +522,7 @@ def test_dynamic_import_bare_path_resolves(tmp_path):
     result = extract_svelte(importer)
     dyn_targets = {str(e.get("target") or "") for e in result["edges"]
                    if e.get("relation") == "dynamic_import"}
-    expected = _make_id(str(target))
+    expected = _make_id(_file_stem(target))
     assert expected in dyn_targets, (
         f"dynamic_import of .svelte that's actually .svelte.ts must "
         f"resolve through the new resolver; "

From 7d85b092a8b3a9a43cfcb256468b776b8240a4d6 Mon Sep 17 00:00:00 2001
From: Kedar Vartak <kedar.vartak22@vit.edu>
Date: Thu, 28 May 2026 23:10:32 +0530
Subject: [PATCH 4/5] fix: limit legacy extension stripping to file nodes

---
 graphify/build.py   | 58 ++++++++++++++++++++++++++++++++++-----------
 tests/test_build.py | 20 +++++++++++++++-
 2 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/graphify/build.py b/graphify/build.py
index 13880390d..fe654e0c6 100644
--- a/graphify/build.py
+++ b/graphify/build.py
@@ -76,7 +76,40 @@ def _normalize_id(s: str) -> str:
 
 
 def _canonical_id(s: str) -> str:
-    return _strip_ext_suffix(_normalize_id(str(s)))
+    return _normalize_id(str(s))
+
+
+def _file_stem_from_path(path_text: str) -> str:
+    path = Path(path_text.replace("\\", "/"))
+    parent = path.parent.name
+    if parent and parent not in (".", ""):
+        return f"{parent}.{path.stem}"
+    return path.stem
+
+
+def _looks_like_file_node(node: dict) -> bool:
+    label = str(node.get("label") or "")
+    source_file = str(node.get("source_file") or "")
+    if source_file and label == Path(source_file.replace("\\", "/")).name:
+        return True
+    return bool(label and Path(label).suffix)
+
+
+def _legacy_file_node_id(node: dict) -> str | None:
+    if not _looks_like_file_node(node):
+        return None
+    node_id = _canonical_id(str(node.get("id") or ""))
+    source_file = str(node.get("source_file") or "")
+    label = str(node.get("label") or "")
+    candidates: set[str] = set()
+    if source_file:
+        candidates.add(_normalize_id(source_file))
+    if label:
+        candidates.add(_strip_ext_suffix(node_id))
+    if node_id in candidates or _strip_ext_suffix(node_id) in candidates:
+        stem_source = source_file or label
+        return _normalize_id(_file_stem_from_path(stem_source))
+    return None
 
 
 def _norm_source_file(p: str | None, root: str | None = None) -> str | None:
@@ -132,11 +165,19 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
         extraction = dict(extraction, edges=extraction["links"])
 
     # Canonicalize legacy node/edge schema before validation.
+    id_remap: dict[str, str] = {}
     for node in extraction.get("nodes", []):
         if not isinstance(node, dict):
             continue
+        original_id = str(node.get("id") or "")
         if node.get("id") not in (None, ""):
             node["id"] = _canonical_id(str(node["id"]))
+        file_node_id = _legacy_file_node_id(node)
+        if file_node_id and file_node_id != node.get("id"):
+            id_remap[node["id"]] = file_node_id
+            if original_id:
+                id_remap[_canonical_id(original_id)] = file_node_id
+            node["id"] = file_node_id
         if "source" in node and "source_file" not in node:
             # Count edges that reference this node so the warning is actionable (#479)
             node_id = node.get("id", "?")
@@ -160,19 +201,6 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
         ft = node.get("file_type", "")
         if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}:
             node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept")
-        # Strip legacy extension suffix from file-level node IDs (e.g. script_pipeline_step_py
-        # → script_pipeline_step) so old cached graphs merge with new AST extractions (#952).
-        if "id" in node:
-            node["id"] = _strip_ext_suffix(node["id"])
-
-    for edge in extraction.get("edges", []):
-        if not isinstance(edge, dict):
-            continue
-        if "source" in edge:
-            edge["source"] = _strip_ext_suffix(edge["source"])
-        if "target" in edge:
-            edge["target"] = _strip_ext_suffix(edge["target"])
-
     for edge in extraction.get("edges", []):
         if not isinstance(edge, dict):
             continue
@@ -182,8 +210,10 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
             edge["target"] = edge["to"]
         if edge.get("source") not in (None, ""):
             edge["source"] = _canonical_id(str(edge["source"]))
+            edge["source"] = id_remap.get(edge["source"], edge["source"])
         if edge.get("target") not in (None, ""):
             edge["target"] = _canonical_id(str(edge["target"]))
+            edge["target"] = id_remap.get(edge["target"], edge["target"])
 
     errors = validate_extraction(extraction)
     # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors.
diff --git a/tests/test_build.py b/tests/test_build.py
index 419cb63d1..24edaa2a4 100644
--- a/tests/test_build.py
+++ b/tests/test_build.py
@@ -349,7 +349,7 @@ def test_build_from_json_relative_source_file_unchanged(tmp_path):
 def test_build_from_json_strips_legacy_language_extension_suffixes():
     extraction = {
         "nodes": [
-            {"id": "script_pipeline_step_py", "label": "pipeline_step.py", "file_type": "code"},
+            {"id": "script_pipeline_step_py", "label": "pipeline_step.py", "file_type": "code", "source_file": "script/pipeline_step.py"},
             {"id": "consumer", "label": "consumer", "file_type": "code"},
         ],
         "edges": [
@@ -363,6 +363,24 @@ def test_build_from_json_strips_legacy_language_extension_suffixes():
     assert G.has_edge("consumer", "script_pipeline_step")
 
 
+def test_build_from_json_does_not_strip_symbol_ids_that_look_like_extensions():
+    extraction = {
+        "nodes": [
+            {"id": "format_c", "label": "format_c()", "file_type": "code", "source_file": "script/pipeline_step.py"},
+            {"id": "parse_json", "label": "parse_json()", "file_type": "code", "source_file": "script/pipeline_step.py"},
+            {"id": "test_py", "label": "test_py()", "file_type": "code", "source_file": "script/pipeline_step.py"},
+        ],
+        "edges": [
+            {"source": "format_c", "target": "parse_json", "relation": "calls", "confidence": "EXTRACTED"},
+            {"source": "test_py", "target": "format_c", "relation": "calls", "confidence": "EXTRACTED"},
+        ],
+    }
+    G = build_from_json(extraction)
+    assert {"format_c", "parse_json", "test_py"}.issubset(G.nodes)
+    assert G.has_edge("format_c", "parse_json")
+    assert G.has_edge("test_py", "format_c")
+
+
 def test_build_merge_prune_absolute_paths_match_relative_nodes(tmp_path):
     """#1007: manifest stores absolute paths, graph nodes store relative paths.
     prune_sources with absolute paths must still remove the right nodes and edges."""

From 4b621e88b497ad12c4a93c10915f9a83b3b01a41 Mon Sep 17 00:00:00 2001
From: Kedar Vartak <kedar.vartak22@vit.edu>
Date: Thu, 28 May 2026 23:12:37 +0530
Subject: [PATCH 5/5] test: keep rationale schema unchanged in id fix

---
 tests/test_rationale.py | 118 +++++++++++++++++++++-------------------
 1 file changed, 63 insertions(+), 55 deletions(-)

diff --git a/tests/test_rationale.py b/tests/test_rationale.py
index 70d9b9c01..b52aa3909 100644
--- a/tests/test_rationale.py
+++ b/tests/test_rationale.py
@@ -1,8 +1,4 @@
-"""Tests for rationale/docstring extraction in extract.py.
-
-Rationale is now stored as a 'rationale' attribute on the parent node
-rather than as separate rationale nodes with rationale_for edges.
-"""
+"""Tests for rationale/docstring extraction in extract.py."""
 import textwrap
 from pathlib import Path
 import pytest
@@ -16,19 +12,15 @@ def _write_py(tmp_path: Path, code: str) -> Path:
     return p
 
 
-def _nodes_with_rationale(result):
-    return [n for n in result["nodes"] if n.get("rationale")]
-
-
 def test_module_docstring_extracted(tmp_path):
     path = _write_py(tmp_path, '''
         """This module handles authentication because legacy sessions were insecure."""
         def login(): pass
     ''')
     result = extract_python(path)
-    nodes = _nodes_with_rationale(result)
-    assert len(nodes) >= 1
-    assert any("authentication" in n["rationale"] for n in nodes)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert len(rationale) >= 1
+    assert any("authentication" in n["label"] for n in rationale)
 
 
 def test_function_docstring_extracted(tmp_path):
@@ -38,8 +30,8 @@ def process():
             pass
     ''')
     result = extract_python(path)
-    nodes = _nodes_with_rationale(result)
-    assert any("chunked" in n["rationale"] for n in nodes)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert any("chunked" in n["label"] for n in rationale)
 
 
 def test_class_docstring_extracted(tmp_path):
@@ -49,8 +41,8 @@ class Cache:
             pass
     ''')
     result = extract_python(path)
-    nodes = _nodes_with_rationale(result)
-    assert any("Redis" in n["rationale"] for n in nodes)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert any("Redis" in n["label"] for n in rationale)
 
 
 def test_rationale_comment_extracted(tmp_path):
@@ -60,12 +52,11 @@ def build():
             pass
     ''')
     result = extract_python(path)
-    nodes = _nodes_with_rationale(result)
-    assert any("NOTE" in n["rationale"] for n in nodes)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert any("NOTE" in n["label"] for n in rationale)
 
 
-def test_no_rationale_for_edges(tmp_path):
-    """Rationale is now an attribute, not a node+edge — no rationale_for edges."""
+def test_rationale_for_edges_present(tmp_path):
     path = _write_py(tmp_path, '''
         """Module docstring explaining the why."""
         def foo():
@@ -74,31 +65,29 @@ def foo():
     ''')
     result = extract_python(path)
     rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"]
-    assert len(rationale_edges) == 0
+    assert len(rationale_edges) >= 1
 
 
-def test_no_rationale_nodes(tmp_path):
-    """No separate rationale nodes are emitted — rationale lives on parent nodes."""
+def test_short_docstring_ignored(tmp_path):
+    """Trivial docstrings under 20 chars should not become rationale nodes."""
     path = _write_py(tmp_path, '''
-        """Module docstring explaining the why."""
         def foo():
-            """Function docstring with rationale."""
+            """Constructor."""
             pass
     ''')
     result = extract_python(path)
-    rationale_nodes = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
-    assert len(rationale_nodes) == 0
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert len(rationale) == 0
 
 
-def test_short_docstring_ignored(tmp_path):
-    """Trivial docstrings under 20 chars should not set the rationale attribute."""
+def test_rationale_confidence_is_extracted(tmp_path):
     path = _write_py(tmp_path, '''
-        def foo():
-            """Constructor."""
-            pass
+        """This module exists because we needed a standalone parser."""
+        def parse(): pass
     ''')
     result = extract_python(path)
-    assert len(_nodes_with_rationale(result)) == 0
+    rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"]
+    assert all(e.get("confidence") == "EXTRACTED" for e in rationale_edges)
 
 
 def test_alembic_module_docstring_suppressed(tmp_path):
@@ -120,8 +109,8 @@ def downgrade():
             pass
     ''')
     result = extract_python(path)
-    nodes = _nodes_with_rationale(result)
-    assert not any("Revision ID" in n["rationale"] for n in nodes)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert not any("Revision ID" in n["label"] for n in rationale)
 
 
 def test_alembic_function_docstrings_still_extracted(tmp_path):
@@ -139,9 +128,11 @@ def downgrade():
             pass
     ''')
     result = extract_python(path)
-    nodes = _nodes_with_rationale(result)
-    assert not any("Revision ID" in n["rationale"] for n in nodes)
-    assert any("auth" in n["rationale"] for n in nodes)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    # module docstring suppressed
+    assert not any("Revision ID" in n["label"] for n in rationale)
+    # function docstring still captured
+    assert any("auth" in n["label"] for n in rationale)
 
 
 def test_non_migration_revision_var_not_suppressed(tmp_path):
@@ -153,8 +144,8 @@ def test_non_migration_revision_var_not_suppressed(tmp_path):
         def get_revision(): pass
     ''')
     result = extract_python(path)
-    nodes = _nodes_with_rationale(result)
-    assert any("audit history" in n["rationale"] for n in nodes)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert any("audit history" in n["label"] for n in rationale)
 
 
 def test_django_migration_module_docstring_suppressed(tmp_path):
@@ -167,8 +158,8 @@ class Migration(migrations.Migration):
             operations = []
     ''')
     result = extract_python(path)
-    nodes = _nodes_with_rationale(result)
-    assert not any("post_priority" in n["rationale"] for n in nodes)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert not any("post_priority" in n["label"] for n in rationale)
 
 
 def test_generated_file_module_docstring_suppressed(tmp_path):
@@ -180,15 +171,16 @@ class UserMessage:
             pass
     ''')
     result = extract_python(path)
-    nodes = _nodes_with_rationale(result)
-    assert not any("protocol buffer" in n["rationale"].lower() for n in nodes)
+    rationale = [n for n in result["nodes"] if n.get("file_type") == "rationale"]
+    assert not any("protocol buffer" in n["label"].lower() for n in rationale)
 
 
 def test_decorated_method_node_id_is_class_qualified(tmp_path):
     """Regression for #1050: @property / @staticmethod / @classmethod methods
-    were emitted with a class-unqualified node id (e.g. ``file_baz``). The
-    rationale walker uses class-qualified ids, so the docstring rationale must
-    land on the same method node id.
+    were emitted with a class-unqualified node id (e.g. ``file_baz``) while the
+    rationale walker emitted the class-qualified id (``file_bar_baz``) as the
+    docstring's edge target. The mismatch caused ``build_from_json`` to drop
+    the rationale_for edge as dangling, orphaning the docstring node.
     """
     path = _write_py(tmp_path, '''
         class Bar:
@@ -222,7 +214,7 @@ def normal(self) -> int:
     assert normal_id.endswith("_bar_normal"), normal_id
 
     # Each decorated method must share the same class-qualified id shape so the
-    # extracted rationale lands on the actual method node.
+    # rationale_for edge target matches the method node id.
     for decorated_name in ("baz", "helper", "factory"):
         matches = [nid for nid, n in nodes_by_id.items()
                    if n.get("label") == f".{decorated_name}()"]
@@ -239,17 +231,33 @@ def normal(self) -> int:
             f"the class-qualified id"
         )
 
-    # Rationale is stored directly on method nodes, not as rationale_for edges.
+    # Every rationale_for edge's target must resolve to an actual node in the
+    # extraction (no dangling edges into phantom unqualified ids).
+    node_ids = set(nodes_by_id.keys())
+    rationale_edges = [e for e in result["edges"] if e.get("relation") == "rationale_for"]
+    for edge in rationale_edges:
+        assert edge["target"] in node_ids, (
+            f"rationale_for edge targets missing node id {edge['target']!r}"
+        )
+
+    # After build_from_json, each decorated-method docstring node must be
+    # connected (degree > 0), not an orphan dropped from the graph.
     g = build_from_json(result)
     for decorated_name in ("baz", "helper", "factory", "normal"):
         method_id = next(
             nid for nid, n in nodes_by_id.items()
             if n.get("label") == f".{decorated_name}()"
         )
-        assert nodes_by_id[method_id].get("rationale"), (
-            f"method node for ``.{decorated_name}()`` is missing docstring rationale"
-        )
-        assert method_id in g.nodes, f"method node {method_id} missing from graph"
-        assert g.nodes[method_id].get("rationale"), (
-            f"method node {method_id} lost rationale after build_from_json"
+        # Find rationale node attached to this method.
+        attached_rationale = [
+            e["source"] for e in rationale_edges if e["target"] == method_id
+        ]
+        assert attached_rationale, (
+            f"no rationale_for edge found for ``.{decorated_name}()`` method"
         )
+        for r_id in attached_rationale:
+            assert r_id in g.nodes, f"rationale node {r_id} missing from graph"
+            assert g.degree(r_id) > 0, (
+                f"rationale node {r_id} for ``.{decorated_name}()`` is orphaned "
+                f"(degree 0) after build_from_json"
+            )