diff --git a/graphify/build.py b/graphify/build.py index a4d33c5e0..fe654e0c6 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -51,6 +51,16 @@ } +_LANG_EXT_SUFFIX_RE = re.compile( + r"_(?:py|pyi|pyw|js|jsx|mjs|cjs|ts|tsx|vue|svelte|astro|go|rs|java|kt|kts|scala|groovy|c|h|cc|cpp|cxx|hh|hpp|hxx|rb|php|cs|swift|lua|r|sh|bash|zsh|fish|ps1|sql|html|css|scss|sass|less|md|mdx|json|yaml|yml|toml|xml|proto|graphql|gql|dart|ex|exs|erl|hrl|clj|cljs|fs|fsx|vb|pl|pm|pas|pp|dpr|dpk|inc|sol|move|sv|svh|v)$" +) + + +def _strip_ext_suffix(s: str) -> str: + """Drop legacy extension suffixes from path-derived IDs (#1033).""" + return _LANG_EXT_SUFFIX_RE.sub("", s) + + def _normalize_id(s: str) -> str: r"""Normalize an ID string the same way extract._make_id does. @@ -65,6 +75,43 @@ def _normalize_id(s: str) -> str: return cleaned.strip("_").casefold() +def _canonical_id(s: str) -> str: + return _normalize_id(str(s)) + + +def _file_stem_from_path(path_text: str) -> str: + path = Path(path_text.replace("\\", "/")) + parent = path.parent.name + if parent and parent not in (".", ""): + return f"{parent}.{path.stem}" + return path.stem + + +def _looks_like_file_node(node: dict) -> bool: + label = str(node.get("label") or "") + source_file = str(node.get("source_file") or "") + if source_file and label == Path(source_file.replace("\\", "/")).name: + return True + return bool(label and Path(label).suffix) + + +def _legacy_file_node_id(node: dict) -> str | None: + if not _looks_like_file_node(node): + return None + node_id = _canonical_id(str(node.get("id") or "")) + source_file = str(node.get("source_file") or "") + label = str(node.get("label") or "") + candidates: set[str] = set() + if source_file: + candidates.add(_normalize_id(source_file)) + if label: + candidates.add(_strip_ext_suffix(node_id)) + if node_id in candidates or _strip_ext_suffix(node_id) in candidates: + stem_source = source_file or label + return _normalize_id(_file_stem_from_path(stem_source)) + return None + 
+ def _norm_source_file(p: str | None, root: str | None = None) -> str | None: """Normalize path separators and relativize absolute paths. @@ -118,9 +165,19 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat extraction = dict(extraction, edges=extraction["links"]) # Canonicalize legacy node/edge schema before validation. + id_remap: dict[str, str] = {} for node in extraction.get("nodes", []): if not isinstance(node, dict): continue + original_id = str(node.get("id") or "") + if node.get("id") not in (None, ""): + node["id"] = _canonical_id(str(node["id"])) + file_node_id = _legacy_file_node_id(node) + if file_node_id and file_node_id != node.get("id"): + id_remap[node["id"]] = file_node_id + if original_id: + id_remap[_canonical_id(original_id)] = file_node_id + node["id"] = file_node_id if "source" in node and "source_file" not in node: # Count edges that reference this node so the warning is actionable (#479) node_id = node.get("id", "?") @@ -144,6 +201,19 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat ft = node.get("file_type", "") if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}: node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept") + for edge in extraction.get("edges", []): + if not isinstance(edge, dict): + continue + if "source" not in edge and "from" in edge: + edge["source"] = edge["from"] + if "target" not in edge and "to" in edge: + edge["target"] = edge["to"] + if edge.get("source") not in (None, ""): + edge["source"] = _canonical_id(str(edge["source"])) + edge["source"] = id_remap.get(edge["source"], edge["source"]) + if edge.get("target") not in (None, ""): + edge["target"] = _canonical_id(str(edge["target"])) + edge["target"] = id_remap.get(edge["target"], edge["target"]) errors = validate_extraction(extraction) # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors. 
@@ -159,7 +229,7 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat # Normalized ID map: lets edges survive when the LLM generates IDs with # slightly different casing or punctuation than the AST extractor. # e.g. "Session_ValidateToken" maps to "session_validatetoken". - norm_to_id: dict[str, str] = {_normalize_id(nid): nid for nid in node_set} + norm_to_id: dict[str, str] = {_canonical_id(nid): nid for nid in node_set} # Iterate edges in a deterministic order. The graph is undirected and stores # direction in _src/_tgt; when two edges collapse onto the same node pair the # last write wins, so an unstable iteration order flips _src/_tgt run-to-run @@ -181,9 +251,9 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat src, tgt = edge["source"], edge["target"] # Remap mismatched IDs via normalization before dropping the edge. if src not in node_set: - src = norm_to_id.get(_normalize_id(src), src) + src = norm_to_id.get(_canonical_id(src), src) if tgt not in node_set: - tgt = norm_to_id.get(_normalize_id(tgt), tgt) + tgt = norm_to_id.get(_canonical_id(tgt), tgt) if src not in node_set or tgt not in node_set: continue # skip edges to external/stdlib nodes - expected, not an error attrs = {k: v for k, v in edge.items() if k not in ("source", "target")} diff --git a/graphify/extract.py b/graphify/extract.py index c2443f1b1..c379c933c 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -73,13 +73,29 @@ def _make_id(*parts: str) -> str: return cleaned.strip("_").casefold() +# Inner extensions that appear in compound filenames (e.g. Button.svelte.ts). +# Stripping them here ensures is-mobile.svelte.ts and is-mobile.svelte share +# the same base ID, while leaving symbol names like format_c or test_py untouched. 
_COMPOUND_INNER_EXTS = frozenset({".svelte", ".vue", ".astro"})


def _file_stem(path: Path) -> str:
    """Build the base name used for a file's node ID.

    The result is the file stem, prefixed with the immediate parent directory
    name when there is one, so same-named files in different directories do
    not collide (#550). When the stem itself ends in a compound inner
    extension (``Button.svelte`` from ``Button.svelte.ts``), that inner
    extension is dropped so both variants share one base ID.
    """
    inner = Path(path.stem)
    # "is-mobile.svelte" -> "is-mobile"; any other stem is used unchanged.
    base = inner.stem if inner.suffix in _COMPOUND_INNER_EXTS else path.stem
    folder = path.parent.name
    if not folder or folder in (".", ""):
        return base
    return f"{folder}.{base}"
tgt_nid = _make_id(_file_stem(resolved)) edges.append({ "source": file_nid, "target": tgt_nid, @@ -1708,7 +1724,7 @@ def ensure_named_node(name: str, line: int) -> str: add_node(nid, name, line) return nid - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def walk(node, parent_class_nid: str | None = None) -> None: @@ -2564,7 +2580,7 @@ def _extract_python_rationale(path: Path, result: dict) -> None: nodes = result["nodes"] edges = result["edges"] seen_ids = {n["id"] for n in nodes} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) def _get_docstring(body_node) -> tuple[str, int] | None: if not body_node: @@ -2582,25 +2598,11 @@ def _get_docstring(body_node) -> tuple[str, int] | None: def _add_rationale(text: str, line: int, parent_nid: str) -> None: label = text[:80].replace("\r\n", " ").replace("\r", " ").replace("\n", " ").strip() - rid = _make_id(stem, "rationale", str(line)) - if rid not in seen_ids: - seen_ids.add(rid) - nodes.append({ - "id": rid, - "label": label, - "file_type": "rationale", - "source_file": str_path, - "source_location": f"L{line}", - }) - edges.append({ - "source": rid, - "target": parent_nid, - "relation": "rationale_for", - "confidence": "EXTRACTED", - "source_file": str_path, - "source_location": f"L{line}", - "weight": 1.0, - }) + for n in nodes: + if n["id"] == parent_nid: + if "rationale" not in n: + n["rationale"] = label + break # Module-level docstring — skip for auto-generated files (Alembic, Django # migrations, protobuf stubs, etc.) whose module docstrings are revision @@ -2681,23 +2683,22 @@ def extract_svelte(path: Path) -> dict: import re as _re src = path.read_text(encoding="utf-8", errors="replace") existing_ids = {n["id"] for n in result.get("nodes", [])} - # Source file node ID must match the one _extract_generic creates: - # _make_id(str(path)) - single arg, no stem prefix. 
Otherwise the source - # endpoint is a phantom node and build_from_json drops the edge (#701). - file_node_id = _make_id(str(path)) + # Source file node ID must match the one _extract_generic creates via + # _make_id(_file_stem(path)); otherwise this edge points at a phantom node (#701). + file_node_id = _make_id(_file_stem(path)) aliases = _load_tsconfig_aliases(path.parent) for m in _re.finditer(r"""import\(\s*['"]([^'"]+)['"]\s*\)""", src): raw = m.group(1) if not raw: continue if raw.startswith("."): - # Relative import - resolve to full path so IDs match file node IDs. + # Relative import - resolve via _file_stem so IDs match file node IDs. resolved = Path(os.path.normpath(path.parent / raw)) # Apply same TS/Svelte resolver fixups as static imports so dynamic # imports of bare paths and .svelte.ts rune files land on real # file nodes instead of phantom ids (#716). resolved = _resolve_js_module_path(resolved) - node_id = _make_id(str(resolved)) + node_id = _make_id(_file_stem(resolved)) stub_source_file = str(resolved) else: # Check tsconfig.json path aliases (e.g. 
"$lib/" -> "src/lib/", "@/" -> "src/") @@ -2711,7 +2712,7 @@ def extract_svelte(path: Path) -> dict: break if resolved_alias is not None: resolved_alias = _resolve_js_module_path(resolved_alias) - node_id = _make_id(str(resolved_alias)) + node_id = _make_id(_file_stem(resolved_alias)) stub_source_file = str(resolved_alias) else: # Bare/scoped import (node_modules) - use last segment; @@ -2763,7 +2764,7 @@ def extract_svelte(path: Path) -> dict: resolved = resolved.with_suffix(".ts") elif resolved.suffix == ".jsx": resolved = resolved.with_suffix(".tsx") - node_id = _make_id(str(resolved)) + node_id = _make_id(_file_stem(resolved)) stub_source_file = str(resolved) else: resolved_alias = None @@ -2773,7 +2774,7 @@ def extract_svelte(path: Path) -> dict: resolved_alias = Path(os.path.normpath(Path(alias_base) / rest)) break if resolved_alias is not None: - node_id = _make_id(str(resolved_alias)) + node_id = _make_id(_file_stem(resolved_alias)) stub_source_file = str(resolved_alias) else: module_name = raw.split("/")[-1] @@ -2822,7 +2823,7 @@ def extract_astro(path: Path) -> dict: import re as _re src = path.read_text(encoding="utf-8", errors="replace") existing_ids = {n["id"] for n in result.get("nodes", [])} - file_node_id = _make_id(str(path)) + file_node_id = _make_id(_file_stem(path)) aliases = _load_tsconfig_aliases(path.parent) # Dynamic imports anywhere in the file: `import('./X.astro')` is legal in # frontmatter setup code and inside expression slots. 
@@ -2833,7 +2834,7 @@ def extract_astro(path: Path) -> dict: if raw.startswith("."): resolved = Path(os.path.normpath(path.parent / raw)) resolved = _resolve_js_module_path(resolved) - node_id = _make_id(str(resolved)) + node_id = _make_id(_file_stem(resolved)) stub_source_file = str(resolved) else: resolved_alias = None @@ -2844,7 +2845,7 @@ def extract_astro(path: Path) -> dict: break if resolved_alias is not None: resolved_alias = _resolve_js_module_path(resolved_alias) - node_id = _make_id(str(resolved_alias)) + node_id = _make_id(_file_stem(resolved_alias)) stub_source_file = str(resolved_alias) else: module_name = raw.split("/")[-1] @@ -2899,7 +2900,7 @@ def extract_astro(path: Path) -> dict: resolved = resolved.with_suffix(".ts") elif resolved.suffix == ".jsx": resolved = resolved.with_suffix(".tsx") - node_id = _make_id(str(resolved)) + node_id = _make_id(_file_stem(resolved)) stub_source_file = str(resolved) else: resolved_alias = None @@ -2909,7 +2910,7 @@ def extract_astro(path: Path) -> dict: resolved_alias = Path(os.path.normpath(Path(alias_base) / rest)) break if resolved_alias is not None: - node_id = _make_id(str(resolved_alias)) + node_id = _make_id(_file_stem(resolved_alias)) stub_source_file = str(resolved_alias) else: module_name = raw.split("/")[-1] @@ -3009,7 +3010,7 @@ def _add_edge(src: str, tgt: str, relation: str, line: int, plain_method_re = _re.compile(r"""^\s*def\s+(\w+)\s*\(""") current_class_nid: str | None = None - file_nid = _make_id(str_path) + file_nid = _make_id(_file_stem(path)) # Ensure the file node exists (tree-sitter pass may have emitted it) if file_nid not in seen_ids: @@ -3104,7 +3105,7 @@ def extract_blade(path: Path) -> dict: except OSError: return {"error": f"cannot read {path}"} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str(path), "source_location": None}] edges = [] @@ -3152,7 +3153,7 @@ def 
extract_dart(path: Path) -> dict: # Use stem (not str(path)) for child IDs to keep them machine-independent. stem = _file_stem(path) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str(path), "source_location": None}] edges = [] @@ -3234,7 +3235,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, "confidence": confidence, "confidence_score": score, "source_file": str_path, "source_location": f"L{line}", "weight": 1.0}) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def walk(node, module_nid: str | None = None) -> None: @@ -3321,7 +3322,7 @@ def extract_sql(path: Path) -> dict: stem = _file_stem(path) str_path = str(path) - file_nid = _make_id(str_path) + file_nid = _make_id(_file_stem(path)) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": None}] edges: list[dict] = [] @@ -3629,7 +3630,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _func_name_from_signature(sig_node) -> str | None: @@ -3881,7 +3882,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _fortran_name(stmt_node) -> str | None: @@ -4053,7 +4054,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def walk(node) -> None: @@ -4281,7 +4282,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = 
_make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def walk(node, parent_impl_nid: str | None = None) -> None: @@ -4460,7 +4461,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _extract_import(node) -> None: @@ -4630,7 +4631,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) _PS_SKIP = frozenset({ @@ -4983,7 +4984,7 @@ def _apply_symbol_resolution_facts( return path_by_resolved = {path.resolve(): path for path in paths} - source_file_id = {path.resolve(): _make_id(str(path)) for path in paths} + source_file_id = {path.resolve(): _make_id(_file_stem(path)) for path in paths} symbol_nodes: dict[tuple[Path, str], str] = {} for node in nodes: source_path = _js_source_path(str(node.get("source_file", "")), root) @@ -5074,7 +5075,7 @@ def add_edge(source: str, target: str, relation: str, context: str, line: int, s if source_id is not None: add_edge( source_id, - _make_id(str(path_by_resolved.get(target_path, target_path))), + _make_id(_file_stem(path_by_resolved.get(target_path, target_path))), "re_exports", "export", star_fact.line, @@ -5098,7 +5099,7 @@ def add_edge(source: str, target: str, relation: str, context: str, line: int, s if source_id is not None: add_edge( source_id, - _make_id(str(path_by_resolved.get(origin[0], origin[0]))), + _make_id(_file_stem(path_by_resolved.get(origin[0], origin[0]))), "re_exports", "export", export_fact.line, @@ -6082,7 +6083,7 @@ def _resolve_cross_file_java_imports( new_edges: list[dict] = [] seen_pairs: set[tuple[str, str]] = set() for path in paths: - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) try: source = path.read_bytes() tree = 
parser.parse(source) @@ -6169,7 +6170,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _read(node) -> str: @@ -6371,7 +6372,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) _IMPORT_KEYWORDS = frozenset({"alias", "import", "require", "use"}) @@ -6563,7 +6564,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, "confidence": confidence, "source_file": str_path, "source_location": f"L{line}", "weight": weight}) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) # Track heading stack for nesting: [(level, nid), ...] @@ -6673,7 +6674,7 @@ def _pascal_resolve_unit(from_path: Path, unit_name: str) -> str: """Resolve a Pascal unit name to the graphify node ID of its source file. Scans all Pascal files under the project root (the highest ancestor that - directly contains .pas/.dpr files) and returns _make_id(str(matched_path)). + directly contains .pas/.dpr files) and returns _make_id(_file_stem(matched_path)). Result is cached per project root so the rglob runs at most once per project. Falls back to _make_id(unit_name) for units not found on disk (e.g. standard RTL units like SysUtils, Windows). 
@@ -6684,7 +6685,7 @@ def _pascal_resolve_unit(from_path: Path, unit_name: str) -> str: unit_map: dict[str, str] = {} for ext in (".pas", ".pp", ".dpr", ".dpk", ".inc"): for f in root.rglob("*" + ext): - unit_map[f.stem.lower()] = _make_id(str(f)) + unit_map[f.stem.lower()] = _make_id(_file_stem(f)) _pascal_unit_cache[root_key] = unit_map return _pascal_unit_cache[root_key].get(unit_name.lower(), _make_id(unit_name)) @@ -6901,7 +6902,7 @@ def _add_edge(src: str, tgt: str, relation: str, line: int, context: str | None def _lineno(text: str, offset: int) -> int: return text.count("\n", 0, offset) + 1 - file_nid = _make_id(str_path) + file_nid = _make_id(_file_stem(path)) _add_node(file_nid, path.name, 1) stripped = _pascal_strip_comments(raw) @@ -7076,7 +7077,7 @@ def add_edge( edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) module_nid = file_nid @@ -7299,7 +7300,7 @@ def add_edge( edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) obj_re = re.compile(r"^\s*object\s+\w+\s*:\s*(\w+)", re.IGNORECASE) @@ -7399,7 +7400,7 @@ def add_edge( edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) obj_re = re.compile(r"^\s*object\s+\w+\s*:\s*(\w+)", re.IGNORECASE) @@ -7511,7 +7512,7 @@ def add_edge(src: str, tgt: str, relation: str, context: str | None = None) -> N edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name) name_elem = xml_root.find(".//Package/Name") @@ -7610,9 +7611,9 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) - # file_nid is fully path-derived and never produced by _make_id(stem, 
func_name), - # so appending "__entry" guarantees a distinct ID from any function node. + file_nid = _make_id(_file_stem(path)) + # file_nid is the file-level parent/stem ID, so appending "__entry" + # guarantees a distinct ID from any function node. entry_nid = file_nid + "__entry" add_node(file_nid, path.name, 1, kind="file") add_node(entry_nid, f"{path.name} script", 1, kind="bash_entrypoint") @@ -7733,7 +7734,7 @@ def walk(node, parent_nid: str) -> None: # like `source ../../etc/passwd` that traverse outside # the project tree (B-1). if resolved.exists(): - tgt_nid = _make_id(str(resolved)) + tgt_nid = _make_id(_file_stem(resolved)) add_edge(file_nid, tgt_nid, "imports_from", line, context="import") else: @@ -7796,7 +7797,7 @@ def extract_sln(path: Path) -> dict: except OSError: return {"nodes": [], "edges": [], "error": f"cannot read {path}"} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) str_path = str(path) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": None}] @@ -7883,7 +7884,7 @@ def extract_csproj(path: Path) -> dict: except ET.ParseError as e: return {"nodes": [], "edges": [], "error": f"XML parse error: {e}"} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) str_path = str(path) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": None}] @@ -7984,7 +7985,7 @@ def extract_razor(path: Path) -> dict: except OSError: return {"nodes": [], "edges": [], "error": f"cannot read {path}"} - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) str_path = str(path) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": None}] @@ -8143,7 +8144,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + 
file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _key_text(pair_node) -> str | None: @@ -8279,7 +8280,7 @@ def add_edge(src: str, tgt: str, relation: str, line: int, edge["context"] = context edges.append(edge) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) add_node(file_nid, path.name, 1) def _type_path_text(node) -> str: @@ -8320,7 +8321,7 @@ def walk(node, parent_type_path: "str | None" = None, resolved = (path.parent / norm).resolve() edge: dict = { "source": file_nid, - "target": _make_id(str(resolved)) if resolved.exists() else _make_id(norm), + "target": _make_id(_file_stem(resolved)) if resolved.exists() else _make_id(norm), "relation": "imports_from" if resolved.exists() else "imports", "context": "import", "confidence": "EXTRACTED", @@ -8518,7 +8519,7 @@ def extract_dmi(path: Path) -> dict: str_path = str(path) stem = _file_stem(path) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": "L1"}] edges: list[dict] = [] @@ -8617,7 +8618,7 @@ def extract_dmm(path: Path) -> dict: return {"nodes": [], "edges": [], "error": str(e)} str_path = str(path) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": "L1"}] edges: list[dict] = [] @@ -8690,7 +8691,7 @@ def extract_dmf(path: Path) -> dict: str_path = str(path) stem = _file_stem(path) - file_nid = _make_id(str(path)) + file_nid = _make_id(_file_stem(path)) nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "code", "source_file": str_path, "source_location": "L1"}] edges: list[dict] = [] @@ -9085,17 +9086,19 @@ def extract( _augment_symbol_resolution_edges(paths, all_nodes, all_edges, root) - # Remap file node IDs from absolute-path-derived to project-relative 
so - # graph.json edge endpoints are stable across machines (#502) + # Remap legacy full-path-derived file node IDs to parent/stem IDs so cached + # per-file extractions converge with fresh output (#1033). id_remap: dict[str, str] = {} for path in paths: - old_id = _make_id(str(path)) + new_id = _make_id(_file_stem(path)) + legacy_ids = {_make_id(os.fspath(path))} try: - new_id = _make_id(str(path.relative_to(root))) + legacy_ids.add(_make_id(os.fspath(path.relative_to(root)))) except ValueError: - continue - if old_id != new_id: - id_remap[old_id] = new_id + pass + for old_id in legacy_ids: + if old_id != new_id: + id_remap[old_id] = new_id if id_remap: for n in all_nodes: if n.get("id") in id_remap: @@ -9166,7 +9169,7 @@ def extract( # Map each node back to its containing file_id so we can ask # "did the caller's file import the callee's file?" - # Use relativized paths to match how file node IDs were remapped above (#502). + # Use parent/stem file IDs to match file nodes after normalization (#1033). 
# Inner extensions of compound filenames (e.g. Button.svelte.ts); mirrors the
# set that extract._file_stem strips so both modules derive the same stem.
_COMPOUND_INNER_EXTS = frozenset({".svelte", ".vue", ".astro"})


def _file_stem(path: Path) -> str:
    """Return ``"<parent>.<stem>"`` for *path*, or just the stem at top level.

    Applies the same compound-inner-extension stripping as
    extract._file_stem; without it the two helpers disagree for names such as
    ``build.vue.sh`` and the resulting edge would point at a phantom node.
    """
    stem = path.stem
    if Path(stem).suffix in _COMPOUND_INNER_EXTS:
        stem = Path(stem).stem
    parent = path.parent.name
    if parent and parent not in (".", ""):
        return f"{parent}.{stem}"
    return stem


def _file_node_id_for_path(path: Path, root: Path) -> str:
    """Return the file node ID used for Bash source-edge endpoints.

    Keeps Bash source-edge file IDs in sync with extract._file_stem-based
    file node IDs (#1033). ``root`` is no longer consulted and is retained
    only so existing callers keep working.
    """
    return _bash_make_id(_file_stem(path))
@@ def test_build_from_json_relative_source_file_unchanged(tmp_path): assert G.nodes["foo_bar"]["source_file"] == "src/foo.py" +def test_build_from_json_strips_legacy_language_extension_suffixes(): + extraction = { + "nodes": [ + {"id": "script_pipeline_step_py", "label": "pipeline_step.py", "file_type": "code", "source_file": "script/pipeline_step.py"}, + {"id": "consumer", "label": "consumer", "file_type": "code"}, + ], + "edges": [ + {"source": "consumer", "target": "script_pipeline_step_py", + "relation": "imports_from", "confidence": "EXTRACTED"}, + ], + } + G = build_from_json(extraction) + assert "script_pipeline_step" in G + assert "script_pipeline_step_py" not in G + assert G.has_edge("consumer", "script_pipeline_step") + + +def test_build_from_json_does_not_strip_symbol_ids_that_look_like_extensions(): + extraction = { + "nodes": [ + {"id": "format_c", "label": "format_c()", "file_type": "code", "source_file": "script/pipeline_step.py"}, + {"id": "parse_json", "label": "parse_json()", "file_type": "code", "source_file": "script/pipeline_step.py"}, + {"id": "test_py", "label": "test_py()", "file_type": "code", "source_file": "script/pipeline_step.py"}, + ], + "edges": [ + {"source": "format_c", "target": "parse_json", "relation": "calls", "confidence": "EXTRACTED"}, + {"source": "test_py", "target": "format_c", "relation": "calls", "confidence": "EXTRACTED"}, + ], + } + G = build_from_json(extraction) + assert {"format_c", "parse_json", "test_py"}.issubset(G.nodes) + assert G.has_edge("format_c", "parse_json") + assert G.has_edge("test_py", "format_c") + + def test_build_merge_prune_absolute_paths_match_relative_nodes(tmp_path): """#1007: manifest stores absolute paths, graph nodes store relative paths. 
prune_sources with absolute paths must still remove the right nodes and edges.""" diff --git a/tests/test_import_extension_resolution.py b/tests/test_import_extension_resolution.py index 0d1222c0a..7312a14aa 100644 --- a/tests/test_import_extension_resolution.py +++ b/tests/test_import_extension_resolution.py @@ -1,16 +1,17 @@ """Tests for #716 — TypeScript bare-path imports, Svelte 5 rune file imports (`from './foo.svelte'` for a `.svelte.ts` file), and directory/index.ts -imports must resolve to the actual file's node id, not a phantom. +imports must resolve to the actual file's parent/stem node id, not a phantom. Before #716, `_import_js` only rewrote `.js → .ts` and `.jsx → .tsx`. Every other shape (bare path, `.svelte → .svelte.ts`, `./foo` directory imports) -produced an id like `..._foo` while the real file's node id was `..._foo_ts`, +produced an id like `..._foo` while the real file's node id is `parent_foo`, so `build_from_json` dropped the edge as external. """ from pathlib import Path from graphify.extract import ( + _file_stem, _make_id, _resolve_js_module_path, extract_js, @@ -185,7 +186,7 @@ def test_bare_path_import_resolves_in_ts_file(tmp_path): importer = _write(tmp_path / "page.ts", "import type { GetNestedType } from './type-helpers'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Bare-path .ts import must resolve to target node id; " f"expected {expected}; got {_import_targets(result)}" @@ -199,7 +200,7 @@ def test_directory_import_resolves_to_index_ts(tmp_path): importer = _write(tmp_path / "page.ts", "import { enqueue } from './queue'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Directory import must resolve to ./queue/index.ts; " f"expected {expected}; got {_import_targets(result)}" @@ -216,7 +217,7 @@ def 
test_dot_svelte_import_resolves_to_dot_svelte_ts(tmp_path): importer = _write(tmp_path / "page.ts", "import { isMobile } from './is-mobile.svelte'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f".svelte → .svelte.ts resolution failed; " f"expected {expected}; got {_import_targets(result)}" @@ -233,7 +234,7 @@ def test_explicit_ts_import_still_works(tmp_path): importer = _write(tmp_path / "page.ts", "import { x } from './foo.ts'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Explicit .ts imports must still resolve; " f"expected {expected}; got {_import_targets(result)}" @@ -247,7 +248,7 @@ def test_explicit_svelte_import_still_works(tmp_path): importer = _write(tmp_path / "page.ts", "import Card from './Card.svelte'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Existing .svelte imports must resolve to the .svelte node, " f"not get redirected; expected {expected}; " @@ -285,7 +286,7 @@ def test_alias_import_with_bare_path_resolves(tmp_path): importer = _write(importer_dir / "page.ts", "import type { X } from '$lib/type-helpers'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Alias + bare-path resolution failed; " f"expected {expected}; got {_import_targets(result)}" @@ -304,7 +305,7 @@ def test_type_only_import_with_bare_path_resolves(tmp_path): importer = _write(tmp_path / "page.ts", "import type { GetNestedType } from './type-helpers'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Type-only import with bare path 
failed to resolve; " f"expected {expected}; got {_import_targets(result)}" @@ -342,7 +343,7 @@ def test_alias_directory_import_resolves_to_index_ts(tmp_path): importer = _write(src / "routes" / "page.ts", "import { enqueue } from '$lib/queue'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Alias + directory resolution failed; " f"expected {expected}; got {_import_targets(result)}" @@ -428,7 +429,7 @@ def test_end_to_end_multi_dot_import_resolves(tmp_path): importer = _write(tmp_path / "page.ts", "import { apply } from './tag-action.shared'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Multi-dot import failed end-to-end; " f"expected {expected}; got {_import_targets(result)}" @@ -448,7 +449,7 @@ def test_resolve_chain_alias_and_extension_compose(tmp_path): importer = _write(src / "routes" / "page.ts", "import { isMobile } from '$lib/hooks/is-mobile.svelte'\n") result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in _import_targets(result), ( f"Alias + .svelte→.svelte.ts chain failed to compose; " f"expected {expected}; got {_import_targets(result)}" @@ -473,7 +474,7 @@ def test_ts_dynamic_import_bare_path_resolves(tmp_path): } """) result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) targets = {str(e.get("target") or "") for e in result["edges"] if e.get("relation") in ("imports", "imports_from")} assert expected in targets, ( @@ -498,7 +499,7 @@ def test_ts_dynamic_import_alias_with_bare_path_resolves(tmp_path): } """) result = extract_js(importer) - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) targets = {str(e.get("target") or "") for e in result["edges"] if e.get("relation") in ("imports", 
"imports_from")} assert expected in targets, ( @@ -521,7 +522,7 @@ def test_dynamic_import_bare_path_resolves(tmp_path): result = extract_svelte(importer) dyn_targets = {str(e.get("target") or "") for e in result["edges"] if e.get("relation") == "dynamic_import"} - expected = _make_id(str(target)) + expected = _make_id(_file_stem(target)) assert expected in dyn_targets, ( f"dynamic_import of .svelte that's actually .svelte.ts must " f"resolve through the new resolver; "