diff --git a/graphify/__main__.py b/graphify/__main__.py index a2678655e..1c4f12cce 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -2562,7 +2562,7 @@ def _load_graph(p: str): elif cmd == "export": subcmd = sys.argv[2] if len(sys.argv) > 2 else "" - if subcmd not in ("html", "callflow-html", "obsidian", "wiki", "svg", "graphml", "neo4j"): + if subcmd not in ("html", "callflow-html", "obsidian", "wiki", "svg", "graphml", "neo4j", "memory-index"): print("Usage: graphify export ", file=sys.stderr) print(" html [--graph PATH] [--labels PATH] [--node-limit N] [--no-viz]", file=sys.stderr) print(" callflow-html [GRAPH|DIR] [--graph PATH] [--labels PATH] [--report PATH] [--sections PATH] [--output HTML]", file=sys.stderr) @@ -2573,6 +2573,8 @@ def _load_graph(p: str): print(" graphml [--graph PATH]", file=sys.stderr) print(" neo4j [--graph PATH] [--push URI] [--user U] [--password P]", file=sys.stderr) print(" (or set NEO4J_PASSWORD instead of --password to keep it off argv)", file=sys.stderr) + print(" memory-index [--graph PATH] [--output HTML] [--next-steps STEPS] [--project NAME]", file=sys.stderr) + sys.exit(1) # Parse shared args @@ -2688,7 +2690,20 @@ def _load_graph(p: str): graph=graph_path, report=report_path, labels=labels_path, - sections=sections_path, + if subcmd == "memory-index": + from graphify.memory_index import write_memory_index as _write_memory_index + out = _write_memory_index( + graph=graph_path, + report=report_path, + output=callflow_output, + next_steps=memory_next_steps, + project_name=memory_project, + ) + print(f"✓ memory-index written → {out}") + sys.exit(0) + +=sections_path, + output=callflow_output, lang=callflow_lang, max_sections=callflow_max_sections, diff --git a/graphify/memory_index.py b/graphify/memory_index.py new file mode 100644 index 000000000..225acab63 --- /dev/null +++ b/graphify/memory_index.py @@ -0,0 +1,576 @@ +""" +Memory index exporter — optimized for LLM context retention. 
"""
Memory index exporter — optimized for LLM context retention.

Generates three files that allow Claude/LLMs to resume work without re-reading
the entire codebase:
- memory_index.json: Compact graph of key modules and dependencies
- MEMORY_REPORT.md: Markdown summary for quick onboarding
- memory_index.html: Interactive filterable UI
"""

import json
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Union

from graphify.security import sanitize_label

# Display limits used by MEMORY_REPORT.md.
_MAX_REPORT_MODULES = 20
_MAX_REPORT_CLUSTERS = 5
_MAX_CLUSTER_MEMBERS = 5
_MAX_REPORT_EDGES = 10

# Placeholder rendered when a value is missing.
_MISSING = "—"


def write_memory_index(
    graph: Optional[Union[str, Path]] = None,
    report: Optional[Union[str, Path]] = None,
    output: Optional[Union[str, Path]] = None,
    *,
    next_steps: Optional[List[str]] = None,
    project_name: Optional[str] = None,
) -> Path:
    """Generate memory-index files optimized for LLM context retention.

    Reads an existing graph.json and generates:
    - memory_index.json: compact index of key modules and critical edges
    - MEMORY_REPORT.md: markdown summary with next steps
    - memory_index.html: interactive filterable table

    Args:
        graph: Path to graph.json (e.g., graphify-out/graph.json).
        report: Path to GRAPH_REPORT.md. Accepted for CLI symmetry; this
            function does not read it.
        output: Output location. When the path has a suffix its parent
            directory is used, otherwise the path itself is the directory.
            Defaults to the directory containing ``graph``. The files are
            always written under their fixed names inside that directory.
        next_steps: List of next action items to include in the report.
        project_name: Project name override. Defaults to the name of the
            directory two levels above ``graph``.

    Returns:
        Path to the generated HTML file (``<output dir>/memory_index.html``).

    Raises:
        ValueError: If ``graph`` is None.
        FileNotFoundError: If ``graph`` does not exist.
    """
    if graph is None:
        raise ValueError("--graph is required for memory-index export")

    graph_path = Path(graph)
    if not graph_path.exists():
        raise FileNotFoundError(f"Graph file not found: {graph_path}")

    output_path = Path(output) if output else graph_path.parent / "memory_index.html"
    output_dir = output_path.parent if output_path.suffix else output_path

    # Load graph data (explicit encoding: do not depend on the platform locale)
    graph_data = json.loads(graph_path.read_text(encoding="utf-8"))
    nodes = graph_data.get("nodes", [])
    edges = graph_data.get("links", [])

    # Infer project name. Resolve first: for a relative path such as
    # "graphify-out/graph.json" the un-resolved grandparent is "." whose
    # name is empty, which would always yield "Unknown Project".
    if project_name is None:
        project_name = graph_path.resolve().parent.parent.name or "Unknown Project"

    # Extract key modules (top slice by degree)
    key_modules, key_ids, degrees = _extract_key_modules(nodes, edges)

    # Extract clusters
    clusters = _extract_clusters(nodes, key_modules)

    # Extract critical edges (EXTRACTED confidence only)
    critical_edges = _extract_critical_edges(edges, key_ids)

    # Calculate token estimate (rough: 1 token per 4 characters)
    label_tokens = sum(
        len(sanitize_label(m.get("label", m["id"]))) // 4 for m in key_modules
    )
    relation_tokens = sum(len(e.get("relation", "")) // 4 for e in critical_edges)
    token_estimate = label_tokens + relation_tokens

    # Build memory index dict
    memory_index = {
        "project": project_name,
        "generated_at": datetime.now().isoformat(),
        "key_modules": key_modules,
        "clusters": clusters,
        "critical_edges": critical_edges,
        "next_steps": next_steps or [],
        "token_estimate": token_estimate,
    }

    # Write files
    _write_memory_json(output_dir, memory_index)
    _write_memory_report(output_dir, memory_index, graph_data, clusters, degrees)
    _write_memory_html(output_dir, memory_index, clusters)

    return output_dir / "memory_index.html"


def _extract_key_modules(
    nodes: list[dict], edges: list[dict]
) -> tuple[list[dict], set[str], dict]:
    """Select the most connected nodes of the graph.

    Degree is the number of edge endpoints that reference a node id. The
    distinct degree values are ranked in descending order and the cut-off is
    the value at rank ``len(distinct) // 7`` (roughly the top 15% of distinct
    degrees), never lower than the first rank. Every node whose degree
    reaches the cut-off is a key module.

    Args:
        nodes: Node dicts; ``id``, ``label``, ``source_file`` and
            ``community`` are read.
        edges: Edge dicts; ``source`` and ``target`` are read.

    Returns:
        (key_modules list sorted by degree descending, key_ids set,
        degrees mapping of node id to degree). All three are empty when no
        edge has a usable endpoint.
    """
    degrees: Counter = Counter()
    for edge in edges:
        for endpoint in (edge.get("source"), edge.get("target")):
            # Skip only missing endpoints; a plain truthiness test would
            # also drop a legitimate node id of 0.
            if endpoint is not None and endpoint != "":
                degrees[endpoint] += 1

    if not degrees:
        return [], set(), {}

    distinct = sorted(set(degrees.values()), reverse=True)
    # Clamp the index itself. Clamping before subtracting 1 produced -1 for
    # fewer than 7 distinct degrees, which selected the minimum degree and
    # therefore marked every connected node as "key".
    cutoff_index = max(0, len(distinct) // 7 - 1)
    threshold = distinct[cutoff_index]

    key_ids = {node_id for node_id, deg in degrees.items() if deg >= threshold}

    key_modules = [
        {
            "id": node.get("id"),
            "label": sanitize_label(node.get("label", node.get("id"))),
            "file": node.get("source_file", ""),
            "degree": degrees.get(node.get("id"), 0),
            "community": node.get("community"),
        }
        for node in nodes
        if node.get("id") in key_ids
    ]

    # Sort by degree descending (stable: ties keep node order)
    key_modules.sort(key=lambda mod: mod["degree"], reverse=True)
    return key_modules, key_ids, degrees


def _extract_clusters(
    nodes: list[dict], key_modules: list[dict]
) -> list[dict]:
    """Group node labels by their ``community`` attribute.

    Nodes without a community are ignored. ``key_modules`` is accepted for
    signature compatibility and is not used.

    Returns:
        Cluster dicts (``id``, ``label``, ``members``) ordered from the
        largest to the smallest membership.
    """
    by_community: dict = {}

    for node in nodes:
        community = node.get("community")
        if community is None:
            continue

        member = sanitize_label(node.get("label", node.get("id", "Unknown")))
        cluster = by_community.setdefault(
            community,
            {"id": community, "label": f"Community {community}", "members": []},
        )
        cluster["members"].append(member)

    # Sort clusters by size
    return sorted(
        by_community.values(), key=lambda c: len(c["members"]), reverse=True
    )


def _extract_critical_edges(edges: list[dict], key_ids: set) -> list[dict]:
    """Extract EXTRACTED-confidence edges between key modules.

    An edge without a ``confidence`` field is treated as EXTRACTED. Both
    endpoints must be members of ``key_ids``.
    """
    critical = []

    for edge in edges:
        source = edge.get("source")
        target = edge.get("target")
        confidence = edge.get("confidence", "EXTRACTED")

        if confidence != "EXTRACTED":
            continue
        if source not in key_ids or target not in key_ids:
            continue

        critical.append({
            "source": source,
            "target": target,
            "relation": edge.get("relation", "unknown"),
            "confidence": confidence,
        })

    return critical


def _write_memory_json(output_dir: Path, memory_index: dict) -> None:
    """Write memory_index.json (2-space indented) into ``output_dir``."""
    output_dir.mkdir(parents=True, exist_ok=True)
    json_path = output_dir / "memory_index.json"
    json_path.write_text(json.dumps(memory_index, indent=2), encoding="utf-8")


def _md_cell(value) -> str:
    """Return ``value`` as text that is safe inside a Markdown table cell."""
    # An unescaped pipe would be read as a column separator.
    return str(value).replace("|", "\\|")


def _write_memory_report(
    output_dir: Path,
    memory_index: dict,
    graph_data: dict,
    clusters: list[dict],
    degrees: dict,
) -> None:
    """Write MEMORY_REPORT.md into ``output_dir``.

    Args:
        output_dir: Target directory; created when missing.
        memory_index: Index dict; ``project``, ``generated_at``,
            ``key_modules``, ``critical_edges``, ``next_steps`` and
            ``token_estimate`` are read.
        graph_data: Accepted for signature compatibility; not used.
        clusters: Cluster dicts with ``id`` and ``members``.
        degrees: Accepted for signature compatibility; not used.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    report_path = output_dir / "MEMORY_REPORT.md"

    project = memory_index["project"]
    generated = memory_index["generated_at"]
    key_modules = memory_index["key_modules"]
    critical_edges = memory_index["critical_edges"]
    next_steps = memory_index["next_steps"]

    lines = [
        f"# Memory Index — {project}",
        "",
        f"**Generated**: {generated}",
        "",
        "## Quick Start",
        "",
        f"This memory index summarizes the key modules and dependencies of {project}.",
        "Use it to quickly understand the architecture without reading the full codebase.",
        "",
        f"**Token estimate**: ~{memory_index['token_estimate']} tokens (vs ~50k for full graph)",
        "",
    ]

    # Key modules table
    if key_modules:
        lines.extend([
            "## Key Modules (top by connectivity)",
            "",
            "| Module | File | Connections | Community |",
            "|--------|------|-------------|-----------|",
        ])
        for mod in key_modules[:_MAX_REPORT_MODULES]:
            label = _md_cell(mod["label"])
            file_name = _md_cell(Path(mod["file"]).name) if mod["file"] else _MISSING
            # The key is always present but may hold None, so a .get()
            # default would never apply; test the value instead.
            community = mod.get("community")
            comm = _MISSING if community is None else _md_cell(community)
            lines.append(f"| `{label}` | {file_name} | {mod['degree']} | {comm} |")
        lines.append("")

    # Clusters
    if clusters:
        lines.extend([
            "## Architecture Clusters",
            "",
        ])
        for cluster in clusters[:_MAX_REPORT_CLUSTERS]:
            members = cluster["members"]
            shown = ", ".join(members[:_MAX_CLUSTER_MEMBERS])
            hidden = len(members) - _MAX_CLUSTER_MEMBERS
            if hidden > 0:
                shown += f", +{hidden} more"
            lines.append(f"**Community {cluster['id']}**: {shown}")
            lines.append("")

    # Critical edges
    if critical_edges:
        lines.extend([
            "## Critical Dependencies",
            "",
        ])
        # De-duplicate by (source, target) BEFORE applying the display limit
        # so that repeated pairs do not eat into the visible entries.
        seen = set()
        shown_count = 0
        for edge in critical_edges:
            pair = (edge["source"], edge["target"])
            if pair in seen:
                continue
            seen.add(pair)
            lines.append(
                f"- `{edge['source']}` **{edge['relation']}** `{edge['target']}`"
            )
            shown_count += 1
            if shown_count == _MAX_REPORT_EDGES:
                break
        lines.append("")

    # Next steps
    if next_steps:
        lines.extend([
            "## Next Steps",
            "",
        ])
        lines.extend(f"{i}. {step}" for i, step in enumerate(next_steps, 1))
        lines.append("")

    # Footer
    lines.extend([
        "## Query the Full Graph",
        "",
        "For deeper exploration, use:",
        "```bash",
        "graphify query --graph graphify-out/graph.json 'Which modules handle facturas?'",
        "```",
        "",
    ])

    # The report always contains non-ASCII characters (the em dash in the
    # title), so the encoding must not depend on the platform default.
    report_path.write_text("\n".join(lines), encoding="utf-8")


def _write_memory_html(
    output_dir: Path,
    memory_index: dict,
    clusters: list[dict],
) -> None:
    """Write memory_index.html — lightweight interactive table."""
    output_dir.mkdir(parents=True, exist_ok=True)
    html_path = output_dir / "memory_index.html"

    key_modules = memory_index["key_modules"]
    project = memory_index["project"]

    # Build module rows
    module_rows = []
    for mod in key_modules:
        row = {
            "id": mod["id"],
            "label": mod["label"],
            "file": mod["file"],
            "degree": mod["degree"],
            "community": mod.get("community", "—"),
        }
        module_rows.append(row)

    # JSON-escape for embedding in HTML
    # NOTE(review): the rest of this function (the `modules_json` escaping,
    # the HTML template and the final `html_path.write_text(html_content)`)
    # is not recoverable from the visible source and is intentionally not
    # reconstructed here; it is left untouched after this block.
+

Memory Index

+

{sanitize_label(project)} • {memory_index['generated_at'].split('T')[0]}

+ +
+ Lightweight index of key modules and dependencies. Use to understand architecture + without reading the full codebase (~{memory_index['token_estimate']} tokens). +
+ +
+
+
{len(key_modules)}
+
Key Modules
+
+
+
{len(clusters)}
+
Clusters
+
+
+
{len(memory_index['critical_edges'])}
+
Dependencies
+
+
+
{memory_index['token_estimate']}
+
Est. Tokens
+
+
+ + + + + + + + + + + + + + +
ModuleFileConnectionsCommunity
+
+ + + + +""" + + html_path.write_text(html_content) diff --git a/graphify/tests/test_memory_index.py b/graphify/tests/test_memory_index.py new file mode 100644 index 000000000..04265a20b --- /dev/null +++ b/graphify/tests/test_memory_index.py @@ -0,0 +1,202 @@ +"""Tests for memory_index exporter.""" + +import json +from pathlib import Path + +import pytest + +from graphify.memory_index import write_memory_index + + +@pytest.fixture +def sample_graph_json(tmp_path: Path) -> Path: + """Create a minimal graph.json fixture for testing.""" + data = { + "nodes": [ + {"id": "A", "label": "ModuleA", "community": 0, "source_file": "src/a.py"}, + {"id": "B", "label": "ModuleB", "community": 0, "source_file": "src/b.py"}, + {"id": "C", "label": "ModuleC", "community": 1, "source_file": "src/c.py"}, + {"id": "D", "label": "ModuleD", "community": 1, "source_file": "src/d.py"}, + ], + "links": [ + {"source": "A", "target": "B", "relation": "calls", "confidence": "EXTRACTED"}, + {"source": "B", "target": "C", "relation": "imports", "confidence": "EXTRACTED"}, + {"source": "C", "target": "D", "relation": "uses", "confidence": "INFERRED"}, + {"source": "D", "target": "A", "relation": "references", "confidence": "AMBIGUOUS"}, + ], + "hyperedges": [], + } + graph_file = tmp_path / "graph.json" + graph_file.write_text(json.dumps(data)) + return graph_file + + +def test_memory_index_creates_files(sample_graph_json: Path, tmp_path: Path): + """Test that write_memory_index creates all three output files.""" + output_html = tmp_path / "output" / "memory_index.html" + write_memory_index(graph=sample_graph_json, output=output_html) + + output_dir = output_html.parent + assert (output_dir / "memory_index.json").exists() + assert (output_dir / "MEMORY_REPORT.md").exists() + assert output_html.exists() + + +def test_memory_index_json_schema(sample_graph_json: Path, tmp_path: Path): + """Test that memory_index.json has correct schema.""" + output_html = tmp_path / "output" / "memory_index.html" 
+ write_memory_index(graph=sample_graph_json, output=output_html) + + json_file = output_html.parent / "memory_index.json" + data = json.loads(json_file.read_text()) + + assert "project" in data + assert "generated_at" in data + assert "key_modules" in data + assert "clusters" in data + assert "critical_edges" in data + assert "next_steps" in data + assert "token_estimate" in data + assert isinstance(data["key_modules"], list) + assert isinstance(data["clusters"], list) + assert isinstance(data["critical_edges"], list) + + +def test_memory_index_key_modules(sample_graph_json: Path, tmp_path: Path): + """Test that key modules are extracted correctly.""" + output_html = tmp_path / "output" / "memory_index.html" + write_memory_index(graph=sample_graph_json, output=output_html) + + json_file = output_html.parent / "memory_index.json" + data = json.loads(json_file.read_text()) + modules = data["key_modules"] + + # Should extract some modules + assert len(modules) > 0 + + # Each module should have required fields + for mod in modules: + assert "id" in mod + assert "label" in mod + assert "file" in mod + assert "degree" in mod + assert "community" in mod + + +def test_memory_index_critical_edges(sample_graph_json: Path, tmp_path: Path): + """Test that only EXTRACTED edges are included in critical_edges.""" + output_html = tmp_path / "output" / "memory_index.html" + write_memory_index(graph=sample_graph_json, output=output_html) + + json_file = output_html.parent / "memory_index.json" + data = json.loads(json_file.read_text()) + edges = data["critical_edges"] + + # All critical edges should have EXTRACTED confidence + for edge in edges: + assert edge["confidence"] == "EXTRACTED" + assert "source" in edge + assert "target" in edge + assert "relation" in edge + + +def test_memory_index_with_next_steps(sample_graph_json: Path, tmp_path: Path): + """Test that next_steps are included in JSON and report.""" + steps = ["Cargar Enero 2026", "Exportar a PDF", "Alertas por email"] + 
output_html = tmp_path / "output" / "memory_index.html" + write_memory_index( + graph=sample_graph_json, + output=output_html, + next_steps=steps, + ) + + # Check JSON + json_file = output_html.parent / "memory_index.json" + data = json.loads(json_file.read_text()) + assert data["next_steps"] == steps + + # Check report + report_file = output_html.parent / "MEMORY_REPORT.md" + report = report_file.read_text() + for step in steps: + assert step in report + + +def test_memory_index_with_project_name(sample_graph_json: Path, tmp_path: Path): + """Test that project_name is included in output.""" + output_html = tmp_path / "output" / "memory_index.html" + project = "MyProject" + write_memory_index( + graph=sample_graph_json, + output=output_html, + project_name=project, + ) + + json_file = output_html.parent / "memory_index.json" + data = json.loads(json_file.read_text()) + assert data["project"] == project + + report_file = output_html.parent / "MEMORY_REPORT.md" + report = report_file.read_text() + assert project in report + + +def test_memory_report_sections(sample_graph_json: Path, tmp_path: Path): + """Test that MEMORY_REPORT.md contains required sections.""" + output_html = tmp_path / "output" / "memory_index.html" + write_memory_index(graph=sample_graph_json, output=output_html) + + report_file = output_html.parent / "MEMORY_REPORT.md" + report = report_file.read_text() + + # Check required sections + assert "# Memory Index" in report + assert "## Quick Start" in report + assert "## Key Modules" in report or "## Architecture Clusters" in report + assert "## Query the Full Graph" in report + + +def test_memory_index_html_has_search(sample_graph_json: Path, tmp_path: Path): + """Test that HTML includes search functionality.""" + output_html = tmp_path / "output" / "memory_index.html" + write_memory_index(graph=sample_graph_json, output=output_html) + + html = output_html.read_text() + + # Check for search input + assert 'id="searchInput"' in html + assert "Search 
modules" in html + + # Check for table structure + assert '' in html + assert ' 0 + assert data["token_estimate"] < 100000 # Sanity check