Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
268 changes: 255 additions & 13 deletions graphify/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,15 @@ def _read_tsconfig_aliases(tsconfig: Path, base_dir: Path, seen: set) -> dict[st
for alias, targets in paths.items():
if not targets:
continue
alias_prefix = alias.rstrip("/*")
target_base = targets[0].rstrip("/*")
aliases[alias_prefix] = str(base_dir / target_base)
target = targets[0]
# Preserve the wildcard in both alias and target so the resolver can
# substitute the captured segment back in. TypeScript's path mapping
# spec uses '*' as a single capture group; keeping it lets us tell
# 'wildcard alias' from 'exact alias' at resolve time.
if "*" in alias or "*" in target:
aliases[alias] = str(base_dir / target) if target else str(base_dir)
else:
aliases[alias.rstrip("/")] = str(base_dir / target.rstrip("/"))

return aliases

Expand Down Expand Up @@ -186,6 +192,12 @@ class LanguageConfig:
# Import handler: called for import nodes instead of generic handling
import_handler: Callable | None = None

# Re-export handler: called for `export ... from '...'` nodes to record
# barrel re-export relationships. Used post-extraction to redirect
# symbol-level imports through barrels to the symbol's defining file.
export_types: frozenset = frozenset()
export_handler: Callable | None = None

# Optional custom name resolver for functions (C, C++ declarator unwrapping)
resolve_function_name_fn: Callable | None = None

Expand Down Expand Up @@ -343,31 +355,151 @@ def _import_python(node, source: bytes, file_nid: str, stem: str, edges: list, s
})


def _resolved_path_to_nid(resolved: Path) -> str:
"""Compute a node id from a resolved import target.

The file extractor uses ``_make_id(str(path))`` against whatever Path
representation it was handed (usually project-relative). Alias resolution
builds an absolute path against the tsconfig's location, so we relativize
to cwd before computing the id — otherwise the edge target id never
matches the file node id and every alias-resolved import becomes a
phantom.
"""
try:
rel = resolved.relative_to(Path.cwd())
return _make_id(str(rel))
except ValueError:
return _make_id(str(resolved))


def _alias_specificity(alias: str) -> "tuple[int, int]":
"""Sort key so the most specific alias matches first.

TypeScript's path-mapping spec is "longest matching prefix wins", with
exact (non-wildcard) entries winning over wildcard entries when both
would match. Returning a tuple lets us pass this straight to ``sorted``:
exact aliases get 0, wildcards 1, then negative-length so longer prefixes
sort first.
"""
if "*" in alias:
return (1, -len(alias.split("*", 1)[0]))
return (0, -len(alias))


def _resolve_js_import_target(raw: str, str_path: str) -> "tuple[str, Path | None] | None":
"""Resolve a JS/TS import path string to (target_nid, resolved_path).

Handles relative paths, tsconfig path aliases, and bare/scoped imports.
Returns None if `raw` is empty.
Handles relative paths, tsconfig path aliases (exact and wildcard), and
bare/scoped imports. Returns None if `raw` is empty.
"""
if not raw:
return None
if raw.startswith("."):
resolved = Path(os.path.normpath(Path(str_path).parent / raw))
resolved = _resolve_js_module_path(resolved)
return _make_id(str(resolved)), resolved
return _resolved_path_to_nid(resolved), resolved
aliases = _load_tsconfig_aliases(Path(str_path).parent)
for alias_prefix, alias_base in aliases.items():
if raw == alias_prefix or raw.startswith(alias_prefix + "/"):
rest = raw[len(alias_prefix):].lstrip("/")
resolved_alias = Path(os.path.normpath(Path(alias_base) / rest))
resolved_alias = _resolve_js_module_path(resolved_alias)
return _make_id(str(resolved_alias)), resolved_alias
for alias_pattern, target_pattern in sorted(aliases.items(), key=lambda kv: _alias_specificity(kv[0])):
if "*" in alias_pattern:
star_idx = alias_pattern.index("*")
prefix = alias_pattern[:star_idx]
suffix = alias_pattern[star_idx + 1:]
if (
raw.startswith(prefix)
and raw.endswith(suffix)
and len(raw) > len(prefix) + len(suffix)
):
captured = raw[len(prefix): len(raw) - len(suffix)] if suffix else raw[len(prefix):]
resolved_str = target_pattern.replace("*", captured)
resolved_alias = Path(os.path.normpath(resolved_str))
resolved_alias = _resolve_js_module_path(resolved_alias)
return _resolved_path_to_nid(resolved_alias), resolved_alias
else:
if raw == alias_pattern or raw.startswith(alias_pattern + "/"):
rest = raw[len(alias_pattern):].lstrip("/")
resolved_alias = Path(os.path.normpath(Path(target_pattern) / rest))
resolved_alias = _resolve_js_module_path(resolved_alias)
return _resolved_path_to_nid(resolved_alias), resolved_alias
module_name = raw.split("/")[-1]
if not module_name:
return None
return _make_id(module_name), None


def _export_js(node, source: bytes, file_nid: str, stem: str, str_path: str, reexports: list) -> None:
"""Record `export ... from '...'` re-exports for post-extraction barrel resolution.

Handles three forms:
export { X, Y as Z } from './mod' -> named re-exports
export * from './mod' -> star re-export
export * as ns from './mod' -> namespace re-export (treated like star)

Plain local exports (without a `from` clause) are not re-exports — they
just declare what this file exposes — so they're skipped.
"""
from_string_node = None
export_clause_node = None
namespace_export = False
star_export = False

for child in node.children:
ct = child.type
if ct == "string":
from_string_node = child
elif ct == "export_clause":
export_clause_node = child
elif ct == "namespace_export":
namespace_export = True
elif ct in ("*", "asterisk"):
star_export = True

if from_string_node is None:
return

raw_source = _read_text(from_string_node, source).strip("'\"` ")
if not raw_source:
return

resolved = _resolve_js_import_target(raw_source, str_path)
if resolved is None:
return
_, resolved_path = resolved
if resolved_path is None:
return

if star_export or namespace_export:
reexports.append({
"from_path": str_path,
"to_path": str(resolved_path),
"kind": "star",
"exposed": None,
"original": None,
})
return

if export_clause_node is None:
return

for spec in export_clause_node.children:
if spec.type != "export_specifier":
continue
name_node = spec.child_by_field_name("name")
alias_node = spec.child_by_field_name("alias")
if name_node is None:
continue
original = _read_text(name_node, source).strip()
exposed = _read_text(alias_node, source).strip() if alias_node else original
if not original or not exposed:
continue
reexports.append({
"from_path": str_path,
"to_path": str(resolved_path),
"kind": "named",
"exposed": exposed,
"original": original,
})


def _import_js(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None:
resolved_path: "Path | None" = None
for child in node.children:
Expand Down Expand Up @@ -395,6 +527,7 @@ def _import_js(node, source: bytes, file_nid: str, stem: str, edges: list, str_p
# defining the symbol, so these edges wire importers directly to existing symbol nodes.
if resolved_path is not None:
target_stem = _file_stem(resolved_path)
target_path_str = str(resolved_path)
line = node.start_point[0] + 1
for child in node.children:
if child.type == "import_clause":
Expand All @@ -414,6 +547,13 @@ def _import_js(node, source: bytes, file_nid: str, stem: str, edges: list, str_p
"source_file": str_path,
"source_location": f"L{line}",
"weight": 1.0,
# Metadata for post-extraction re-export resolution.
# If `target_path_str` is a barrel re-exporting `sym`,
# the post-process pass follows the chain to find the
# symbol's actual defining file and rewrites the
# target id. Stripped before graph.json is written.
"_reexport_target_path": target_path_str,
"_reexport_symbol": sym,
})


Expand Down Expand Up @@ -900,6 +1040,8 @@ def _swift_extra_walk(node, source: bytes, file_nid: str, stem: str, str_path: s
call_accessor_field="property",
function_boundary_types=frozenset({"function_declaration", "arrow_function", "method_definition"}),
import_handler=_import_js,
export_types=frozenset({"export_statement"}),
export_handler=_export_js,
)

_TS_CONFIG = LanguageConfig(
Expand All @@ -919,6 +1061,8 @@ def _swift_extra_walk(node, source: bytes, file_nid: str, stem: str, str_path: s
call_accessor_field="property",
function_boundary_types=frozenset({"function_declaration", "arrow_function", "method_definition"}),
import_handler=_import_js,
export_types=frozenset({"export_statement"}),
export_handler=_export_js,
)

# .tsx files must use the TSX grammar (JSX-aware), not the plain TypeScript grammar.
Expand Down Expand Up @@ -1257,6 +1401,15 @@ def walk(node, parent_class_nid: str | None = None) -> None:
config.import_handler(node, source, file_nid, stem, edges, str_path)
return

# Re-export types (`export ... from '...'`). Recorded for the
# post-extraction barrel resolution pass in extract().
if t in config.export_types:
if config.export_handler:
config.export_handler(node, source, file_nid, stem, str_path, reexports)
# Don't `return` here — children of export_statement may include
# class/function declarations (e.g. `export class Foo {}`) that
# we still want to walk normally.

# Class types
if t in config.class_types:
# Resolve class name
Expand Down Expand Up @@ -1522,6 +1675,13 @@ def _emit_java_parent(base_name: str, rel: str, at_line: int) -> None:
for child in node.children:
walk(child, parent_class_nid=None)

# `reexports` is filled by config.export_handler during walk, so it must
# exist in the enclosing scope before walk runs. Other accumulators
# (raw_calls, etc.) are referenced only on conditional paths and survive
# being initialised later, but the export handler fires for every file
# with any `export ... from '...'` statement.
reexports: list[dict] = []

walk(root)

# ── Call-graph pass ───────────────────────────────────────────────────────
Expand Down Expand Up @@ -1858,7 +2018,7 @@ def walk_calls(node, caller_nid: str) -> None:
if src in valid_ids and (tgt in valid_ids or edge["relation"] in ("imports", "imports_from")):
clean_edges.append(edge)

return {"nodes": nodes, "edges": clean_edges, "raw_calls": raw_calls}
return {"nodes": nodes, "edges": clean_edges, "raw_calls": raw_calls, "reexports": reexports}


# ── Python rationale extraction ───────────────────────────────────────────────
Expand Down Expand Up @@ -5902,6 +6062,88 @@ def extract(
if e.get("target") in id_remap:
e["target"] = id_remap[e["target"]]

# ── Barrel re-export resolution for JS/TS symbol-level imports ────────────
# `import { Foo } from '@pillar/topic'` resolves the file-level edge to the
# topic's barrel index.ts, but the symbol-level edge ends up targeting a
# synthetic id like `<barrel_stem>_<symbol>` that nothing actually defines —
# build_from_json drops it and the consumer ↔ definer coupling is lost.
# Follow the `export { Foo } from './foo'` / `export * from './foo'` chain
# collected per-file at extract time, then rewrite each symbol-level edge
# to point at the symbol's defining file.
named_reexports: dict[tuple[str, str], tuple[str, str]] = {}
star_reexports: dict[str, list[str]] = {}
for result in per_file:
if not result:
continue
for rx in result.get("reexports", ()):
kind = rx.get("kind")
frm = rx.get("from_path")
to = rx.get("to_path")
if not frm or not to:
continue
if kind == "named":
exposed = rx.get("exposed")
original = rx.get("original")
if exposed and original:
named_reexports[(frm, exposed)] = (to, original)
elif kind == "star":
star_reexports.setdefault(frm, []).append(to)

def _resolve_reexport_chain(initial_path: str, initial_symbol: str, max_depth: int = 16) -> "tuple[str, str]":
visited: set[tuple[str, str]] = set()
cur_path, cur_symbol = initial_path, initial_symbol
for _ in range(max_depth):
key = (cur_path, cur_symbol)
if key in visited:
break
visited.add(key)
nxt = named_reexports.get(key)
if nxt is not None:
cur_path, cur_symbol = nxt
continue
stars = star_reexports.get(cur_path)
if stars:
advanced = False
for st in stars:
if (st, cur_symbol) in named_reexports or st in star_reexports:
cur_path = st
advanced = True
break
if advanced:
continue
break
return cur_path, cur_symbol

if named_reexports or star_reexports:
existing_ids = {n["id"] for n in all_nodes}
reexport_rewrites = 0
for e in all_edges:
tp = e.pop("_reexport_target_path", None)
sym = e.pop("_reexport_symbol", None)
if tp is None or sym is None:
continue
final_path, final_symbol = _resolve_reexport_chain(tp, sym)
if (final_path, final_symbol) == (tp, sym):
continue
new_target = _make_id(_file_stem(Path(final_path)), final_symbol)
if new_target in existing_ids:
e["target"] = new_target
reexport_rewrites += 1
# Strip metadata from any remaining edges (e.g. when no chain applied).
for e in all_edges:
e.pop("_reexport_target_path", None)
e.pop("_reexport_symbol", None)
if reexport_rewrites:
import logging
logging.getLogger(__name__).info(
"Resolved %d symbol-level imports through barrel re-exports",
reexport_rewrites,
)
else:
for e in all_edges:
e.pop("_reexport_target_path", None)
e.pop("_reexport_symbol", None)

# Add cross-file class-level edges (Python only - uses Python parser internally)
py_paths = [p for p in paths if p.suffix == ".py"]
if py_paths:
Expand Down