Skip to content
110 changes: 110 additions & 0 deletions app/services/blast_radius_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""
Public data types for blast-radius analysis.

These are deliberately framework-agnostic dataclasses -- no FalkorDB,
no chunker, no workflow imports. Everything else in the blast-radius
feature depends on this module; this module depends on nothing.
"""

from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional


class EdgeConfidence(str, Enum):
    """How certain the resolver is that a graph edge is real.

    EXTRACTED -- both ends syntactically certain (e.g. `from x import y; y()`).
    INFERRED  -- resolves through one import hop, no aliasing/re-export ambiguity.
    AMBIGUOUS -- multiple candidate targets, dynamic dispatch, decorator-wrapped.

    An AMBIGUOUS call site must emit ALL of its candidates (recall = 1.0 design).
    Members are also plain strings, so they compare equal to their values.
    """

    EXTRACTED = "extracted"
    INFERRED = "inferred"
    AMBIGUOUS = "ambiguous"

    def default_score(self) -> float:
        """Return the numeric score associated with this confidence tier."""
        if self is EdgeConfidence.EXTRACTED:
            return 1.0
        if self is EdgeConfidence.INFERRED:
            return 0.7
        # Only AMBIGUOUS remains.
        return 0.3


@dataclass
class Symbol:
    """A function, method, or class in the code graph.

    `qualified_name` is the identity (e.g. `app.services.analyzer.Analyzer.review`)
    and is stable across edits in a way line numbers are not.
    """

    # Repository the symbol was extracted from.
    repo_id: str
    # Path of the file that defines the symbol.
    path: str
    # Dotted identity of the symbol; see the class docstring.
    qualified_name: str
    kind: str  # "function" | "method" | "class"
    # Line span of the definition.
    # NOTE(review): the Python extractor emits 1-based lines -- confirm any
    # other producer follows the same convention.
    start_line: int
    end_line: int
    # Fingerprint compared between two versions of a file to detect a
    # modified symbol.
    signature_hash: str
    # Flags below default to False.
    is_test: bool = False
    # NOTE(review): nothing in this module sets or explains the next two
    # flags -- presumably filled in by graph indexing; confirm.
    is_entry_point: bool = False
    is_hub: bool = False


@dataclass
class ChangedSymbol:
    """A symbol that was added/modified/removed by the PR diff."""

    # Path of the file in which the change was detected.
    path: str
    # Dotted identity of the affected symbol.
    qualified_name: str
    change_kind: str  # "added" | "modified" | "removed"


@dataclass
class CallEdge:
    """A CALLS edge between two symbols.

    `score` defaults to `confidence.default_score()` when not supplied
    (or when passed as None). An explicitly supplied score, including 0.0,
    is kept as-is.
    """

    # Qualified names of the caller and the callee.
    from_qname: str
    to_qname: str
    confidence: EdgeConfidence
    # NOTE(review): presumably a label for how the callee was resolved --
    # the set of valid values is not visible here; confirm.
    resolution_method: str
    # None means "derive from `confidence`"; replaced in __post_init__.
    score: Optional[float] = None
    # Has a default only because a dataclass field without one cannot follow
    # the defaulted `score`. Callers that track provenance should still pass it.
    # NOTE(review): presumably the commit SHA the edge was extracted at -- confirm.
    source_sha: str = ""

    def __post_init__(self) -> None:
        """Fill in the tier's default score when none was supplied."""
        if self.score is None:
            self.score = self.confidence.default_score()


@dataclass
class ImpactedUnchangedFile:
    """A file outside the PR diff that is reachable from a changed symbol.

    Carried as context only -- never used as a review-iteration target,
    because it has no patch hunks or inline-comment positions.
    """

    # Path of the impacted file (not part of the PR diff).
    path: str
    # NOTE(review): presumably the qualified name of the symbol through which
    # this file was reached -- confirm against the traversal code.
    reached_via_symbol: str
    # NOTE(review): presumably the number of graph hops from the changed
    # symbol to this file -- confirm.
    hops: int
    # NOTE(review): how per-edge confidences along the path are combined
    # into this single value is not visible here -- confirm.
    confidence: EdgeConfidence


@dataclass
class BlastRadius:
    """Output of `compute_blast_radius()`.

    See spec D4 for field semantics.
    """

    # Symbols the PR diff added, modified or removed.
    changed_symbols: List[ChangedSymbol]
    impacted_symbols: List[str]  # qualified_names
    # NOTE(review): presumably the paths of the files in the PR diff -- confirm.
    pr_files: List[str]
    # Files outside the PR diff that are reachable from a changed symbol.
    impacted_unchanged_files: List[ImpactedUnchangedFile]
    test_set: List[str]  # file paths reachable via TESTED_BY
    dropped_due_to_cap: Dict[str, int]  # keyed by reason, e.g. {"hop2plus_cap": 47}
    edge_confidence_summary: Dict[str, int]  # {"extracted": 12, ...}
    # NOTE(review): range and scale of the score are defined outside this
    # module (spec D4) -- confirm before comparing against thresholds.
    risk_score: float
    # NOTE(review): presumably human-readable reasons behind `risk_score` -- confirm.
    why_risky: List[str]
    # NOTE(review): presumably False when the code graph could not be
    # queried -- confirm.
    graph_available: bool
165 changes: 165 additions & 0 deletions app/services/code_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import tree_sitter_typescript as tsts
from tree_sitter import Language, Parser

from app.services.blast_radius_types import ChangedSymbol, Symbol
from app.services.vector_store import CodeChunk

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -361,6 +362,170 @@ def _infer_chunk_type(self, node_type: str) -> str:
else:
return ChunkType.UNKNOWN

# ---------------------------------------------------------------
# Symbol extraction (used by blast-radius graph indexing).
# ---------------------------------------------------------------

# Filename markers for Python test modules, consumed by `_is_python_test_file`:
# index 0 is matched as a basename prefix, index 1 as a suffix of the
# basename with `.py` removed.
_PYTHON_TEST_FILE_PATTERNS = ("test_", "_test")
# Name prefix a function needs (in addition to living in a test file) to be
# flagged `is_test` by the Python symbol extractor.
_PYTHON_TEST_FUNC_PREFIX = "test_"

def extract_symbols(
    self,
    source_code: str,
    language: str,
    file_path: str,
    repo_id: str,
) -> list:
    """
    Build the list of `Symbol` records found in one source file.

    Only Python is handled so far (PR 1); every other `language` value
    yields an empty list until PR 2 adds the remaining languages together
    with their CallResolver implementations.
    """
    if language == "python":
        return self._extract_python_symbols(source_code, file_path, repo_id)
    # Unsupported language: nothing to extract.
    return []

def _extract_python_symbols(
    self, source_code: str, file_path: str, repo_id: str
) -> list:
    """Collect a `Symbol` for every function, method and class in a Python file.

    Qualified names have the form `<module>.<enclosing scopes>.<name>`.
    Enclosing scopes include functions as well as classes, so two inner
    helpers with the same name in different functions get distinct
    identities instead of colliding on `<module>.<name>`. Empty parts
    (e.g. an empty module name) are dropped from the dotted name.

    Args:
        source_code: Full text of the file.
        file_path: Path used for `Symbol.path` and to derive the module name.
        repo_id: Copied verbatim into every returned `Symbol`.

    Returns:
        Symbols in source order (outer definitions before the ones nested
        in them), or [] when no Python parser is available.
    """
    parser = self._get_parser("python")
    if parser is None:
        return []

    tree = parser.parse(source_code.encode("utf-8"))
    module_qname = self._python_module_qname(file_path)
    is_test_file = self._is_python_test_file(file_path)
    test_prefix = self._PYTHON_TEST_FUNC_PREFIX

    symbols = []
    # Explicit stack instead of recursion: the walk visits every node, and
    # deeply nested expressions would otherwise hit the recursion limit.
    # Each entry is (node, scope) where scope is a tuple of
    # (name, is_class) pairs for the enclosing definitions, outermost first.
    pending = [(tree.root_node, ())]
    while pending:
        node, scope = pending.pop()
        children = node.children
        child_scope = scope

        if node.type in ("function_definition", "class_definition"):
            name_node = node.child_by_field_name("name")
            if name_node is None:
                # Nameless definition: skip it and everything inside it.
                continue
            name = name_node.text.decode("utf-8")
            is_class = node.type == "class_definition"

            enclosing_names = [scope_name for scope_name, _ in scope]
            qname = ".".join(
                part for part in (module_qname, *enclosing_names, name) if part
            )

            if is_class:
                kind = "class"
                is_test = False
            else:
                # A method is a function defined directly in a class body;
                # a helper nested inside a method is a plain function.
                kind = "method" if scope and scope[-1][1] else "function"
                inside_class = any(scope_is_class for _, scope_is_class in scope)
                is_test = (
                    is_test_file
                    and not inside_class
                    and name.startswith(test_prefix)
                )

            symbols.append(
                Symbol(
                    repo_id=repo_id,
                    path=file_path,
                    qualified_name=qname,
                    kind=kind,
                    # Parser rows are 0-based; Symbol lines are 1-based.
                    start_line=node.start_point[0] + 1,
                    end_line=node.end_point[0] + 1,
                    signature_hash=self._python_signature_hash(node),
                    is_test=is_test,
                )
            )

            # Only the body can hold nested definitions.
            body = node.child_by_field_name("body")
            children = body.children if body is not None else []
            child_scope = (*scope, (name, is_class))

        # Push in reverse so children are popped in source order.
        pending.extend((child, child_scope) for child in reversed(children))

    return symbols

def extract_changed_symbols(
    self,
    file_path: str,
    source_before: str,
    source_after: str,
    language: str,
) -> list:
    """
    Compare before/after sources of a file and return ChangedSymbol entries.

    - "added"    -- in `after` only.
    - "removed"  -- in `before` only.
    - "modified" -- in both, but signature_hash differs.
    Unchanged symbols are NOT returned.

    Symbols are matched by qualified name; if a source yields the same
    qualified name more than once, the last occurrence wins. The result is
    deterministic: added/modified entries follow the order of `after`,
    then removed entries follow the order of `before`.
    """
    before_syms = {
        s.qualified_name: s
        for s in self.extract_symbols(source_before, language, file_path, repo_id="")
    }
    after_syms = {
        s.qualified_name: s
        for s in self.extract_symbols(source_after, language, file_path, repo_id="")
    }

    changes = []
    for qname, sym in after_syms.items():
        previous = before_syms.get(qname)
        if previous is None:
            change_kind = "added"
        elif previous.signature_hash != sym.signature_hash:
            change_kind = "modified"
        else:
            continue
        changes.append(
            ChangedSymbol(path=file_path, qualified_name=qname, change_kind=change_kind)
        )

    # Walk `before_syms` itself rather than a set difference: set iteration
    # order depends on string hashing, which varies between processes.
    changes.extend(
        ChangedSymbol(path=file_path, qualified_name=qname, change_kind="removed")
        for qname in before_syms
        if qname not in after_syms
    )
    return changes

def _python_module_qname(self, file_path: str) -> str:
"""Convert `app/services/foo.py` -> `app.services.foo`."""
if file_path.endswith(".py"):
file_path = file_path[:-3]
# Drop `__init__` suffixes so the package itself is the qualifier.
if file_path.endswith("/__init__"):
file_path = file_path[: -len("/__init__")]
return file_path.replace("/", ".")

def _is_python_test_file(self, file_path: str) -> bool:
name = file_path.rsplit("/", 1)[-1]
return (
name.startswith(self._PYTHON_TEST_FILE_PATTERNS[0])
or name.removesuffix(".py").endswith(self._PYTHON_TEST_FILE_PATTERNS[1])
or "/tests/" in f"/{file_path}"
)

def _python_signature_hash(self, node) -> str:
"""Hash the symbol's full text - body included.

Body changes invalidate downstream cached call resolutions because the
callee set inside the function may have changed.
"""
text = node.text.decode("utf-8") if node.text else ""
return hashlib.md5(text.encode("utf-8")).hexdigest()

def chunk_file(
self,
file_path: str,
Expand Down
Loading