Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ These are only needed for **headless / CI extraction** (`graphify extract`). Whe
| `GEMINI_API_KEY` or `GOOGLE_API_KEY` | Google Gemini backend | `--backend gemini` |
| `OPENAI_API_KEY` | OpenAI or OpenAI-compatible APIs | `--backend openai` |
| `DEEPSEEK_API_KEY` | DeepSeek backend | `--backend deepseek` |
| `OPENROUTER_API_KEY` | OpenRouter DeepSeek/Kimi backends | default for extraction via `openrouter-deepseek`; explicit `--backend openrouter-deepseek` or `--backend openrouter-kimi` |
| `MOONSHOT_API_KEY` | Kimi Code backend | `--backend kimi` |
| `OLLAMA_BASE_URL` | Ollama local inference URL | `--backend ollama` (default: `http://localhost:11434`) |
| `OLLAMA_MODEL` | Ollama model name | `--backend ollama` (default: auto-detect) |
Expand All @@ -343,7 +344,7 @@ These are only needed for **headless / CI extraction** (`graphify extract`). Whe

- **Code files** — processed locally via tree-sitter. Nothing leaves your machine.
- **Video / audio** — transcribed locally with faster-whisper. Nothing leaves your machine.
- **Docs, PDFs, images** — sent to your AI assistant for semantic extraction (via the `/graphify` skill, using whatever model your IDE session runs). Headless `graphify extract` requires `GEMINI_API_KEY` / `GOOGLE_API_KEY` (Gemini), `MOONSHOT_API_KEY` (Kimi), `ANTHROPIC_API_KEY` (Claude), `OPENAI_API_KEY` (OpenAI), `DEEPSEEK_API_KEY` (DeepSeek), a running Ollama instance (`OLLAMA_BASE_URL`), AWS credentials via the standard provider chain (Bedrock - no API key needed, uses IAM), or the `claude` CLI binary (Claude Code - no API key needed, uses your Claude subscription). The `--dedup-llm` flag uses the same key.
- **Docs, PDFs, images** — sent to your AI assistant for semantic extraction (via the `/graphify` skill, using whatever model your IDE session runs). Headless `graphify extract` defaults to OpenRouter DeepSeek when `OPENROUTER_API_KEY` is set, or can use `DEEPSEEK_API_KEY` (DeepSeek), `GEMINI_API_KEY` / `GOOGLE_API_KEY` (Gemini), `MOONSHOT_API_KEY` (Kimi), `ANTHROPIC_API_KEY` (Claude), `OPENAI_API_KEY` (OpenAI), a running Ollama instance (`OLLAMA_BASE_URL`), AWS credentials via the standard provider chain (Bedrock - no API key needed, uses IAM), or the `claude` CLI binary (Claude Code - no API key needed, uses your Claude subscription). The `--dedup-llm` flag uses the same key.
- No telemetry, no usage tracking, no analytics.

---
Expand Down Expand Up @@ -453,7 +454,8 @@ graphify kiro install / uninstall
graphify antigravity install / uninstall

graphify extract ./docs # headless LLM extraction for CI (no IDE needed)
graphify extract ./docs --backend gemini # explicit backend: gemini, kimi, claude, openai, deepseek, ollama, bedrock, or claude-cli
graphify extract ./docs --backend openrouter-deepseek # explicit OpenRouter DeepSeek backend (default when OPENROUTER_API_KEY is set)
graphify extract ./docs --backend gemini # explicit backend: openrouter-deepseek, openrouter-kimi, deepseek, gemini, kimi, claude, openai, ollama, bedrock, or claude-cli
graphify extract ./docs --backend gemini --model gemini-3.1-pro-preview
graphify extract ./docs --backend ollama # local Ollama (set OLLAMA_BASE_URL / OLLAMA_MODEL) - no API key needed for loopback
GRAPHIFY_OLLAMA_NUM_CTX=32768 graphify extract ./docs --backend ollama # override KV-cache window (auto-sized by default)
Expand All @@ -470,6 +472,7 @@ graphify extract ./docs --force # overwrite graph.json even if ne
graphify extract ./docs --dedup-llm # LLM tiebreaker for ambiguous entity pairs (uses same API key)
graphify extract ./docs --global --as myrepo # extract and register into the cross-project global graph
GRAPHIFY_MAX_OUTPUT_TOKENS=32768 graphify extract ./docs --backend claude # raise output cap for dense corpora
graphify quality graphify-out/graph.json # schema-quality gate for generated graph JSON

graphify export callflow-html # graphify-out/<project>-callflow.html
graphify export callflow-html --max-sections 8 # cap generated architecture sections
Expand Down
47 changes: 39 additions & 8 deletions graphify/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1241,6 +1241,8 @@ def main() -> None:
print(" --context C explicit edge-context filter (repeatable)")
print(" --budget N cap output at N tokens (default 2000)")
print(" --graph <path> path to graph.json (default graphify-out/graph.json)")
print(" quality [graph.json] inspect graph.json schema quality")
print(" --json emit machine-readable quality report")
print(" save-result save a Q&A result to graphify-out/memory/ for graph feedback loop")
print(" --question Q the question asked")
print(" --answer A the answer to save")
Expand All @@ -1256,7 +1258,7 @@ def main() -> None:
print(" --top-k-edges N per-symbol outbound edges in inspector (default 12)")
print(" --label NAME project label in header")
print(" extract <path> headless full extraction (AST + semantic LLM) for CI/scripts")
print(" --backend B gemini|kimi|claude|openai|deepseek|ollama (default: whichever API key is set)")
print(" --backend B openrouter-deepseek|openrouter-kimi|deepseek|gemini|kimi|claude|openai|ollama (default: OpenRouter DeepSeek when OPENROUTER_API_KEY is set)")
print(" --model M override backend default model")
print(" --max-workers N AST extraction subprocess count (default: cpu_count)")
print(" --token-budget N per-chunk token cap for semantic extraction (default: 60000)")
Expand Down Expand Up @@ -1576,6 +1578,26 @@ def main() -> None:
source_nodes=opts.nodes or None,
)
print(f"Saved to {out}")
elif cmd == "quality":
from graphify.quality import format_report, inspect_graph
graph_path = Path(_default_graph_path())
emit_json = False
for arg in sys.argv[2:]:
if arg == "--json":
emit_json = True
else:
graph_path = Path(arg)
try:
report = inspect_graph(graph_path)
except Exception as exc:
print(f"error: could not inspect graph quality: {exc}", file=sys.stderr)
sys.exit(1)
if emit_json:
print(json.dumps(report, indent=2))
else:
print(format_report(report))
if report["status"] != "pass":
sys.exit(1)
elif cmd == "path":
if len(sys.argv) < 4:
print("Usage: graphify path \"<source>\" \"<target>\" [--graph path]", file=sys.stderr)
Expand Down Expand Up @@ -2402,7 +2424,7 @@ def _load_graph(p: str):
# has an API key set.
if len(sys.argv) < 3:
print(
"Usage: graphify extract <path> [--backend gemini|kimi|claude|openai|deepseek|ollama] "
"Usage: graphify extract <path> [--backend openrouter-deepseek|openrouter-kimi|deepseek|gemini|kimi|claude|openai|ollama] "
"[--model M] [--out DIR] [--google-workspace] [--no-cluster] "
"[--max-workers N] [--token-budget N] [--max-concurrency N] "
"[--api-timeout S]",
Expand Down Expand Up @@ -2525,14 +2547,16 @@ def _parse_float(name: str, raw: str) -> float:
extract_corpus_parallel as _extract_corpus_parallel,
_format_backend_env_keys,
_get_backend_api_key,
_sanitize_extraction_result,
)
if backend is None:
backend = _detect_backend()
if backend is None:
print(
"error: no LLM API key found. Set GEMINI_API_KEY or GOOGLE_API_KEY "
"(gemini), MOONSHOT_API_KEY (kimi), ANTHROPIC_API_KEY (claude), "
"OPENAI_API_KEY (openai), DEEPSEEK_API_KEY (deepseek), "
"error: no LLM API key found. Set OPENROUTER_API_KEY "
"(default openrouter-deepseek), DEEPSEEK_API_KEY (deepseek), "
"GEMINI_API_KEY or GOOGLE_API_KEY (gemini), MOONSHOT_API_KEY (kimi), "
"ANTHROPIC_API_KEY (claude), OPENAI_API_KEY (openai), "
"or pass --backend.",
file=sys.stderr,
)
Expand Down Expand Up @@ -2677,11 +2701,16 @@ def _parse_float(name: str, raw: str) -> float:
cached_nodes, cached_edges, cached_hyperedges, uncached_paths = (
_check_semantic_cache(sem_paths_str, root=target)
)
cached_fragment = _sanitize_extraction_result({
"nodes": cached_nodes,
"edges": cached_edges,
"hyperedges": cached_hyperedges,
})
sem_cache_hits = len(semantic_files) - len(uncached_paths)
sem_cache_misses = len(uncached_paths)
sem_result["nodes"].extend(cached_nodes)
sem_result["edges"].extend(cached_edges)
sem_result["hyperedges"].extend(cached_hyperedges)
sem_result["nodes"].extend(cached_fragment["nodes"])
sem_result["edges"].extend(cached_fragment["edges"])
sem_result["hyperedges"].extend(cached_fragment["hyperedges"])
if sem_cache_hits:
print(f"[graphify extract] semantic cache: {sem_cache_hits} hit / {sem_cache_misses} miss")

Expand Down Expand Up @@ -2722,6 +2751,7 @@ def _progress(idx: int, total: int, _result: dict) -> None:
file=sys.stderr,
)
fresh = {"nodes": [], "edges": [], "hyperedges": [], "input_tokens": 0, "output_tokens": 0}
fresh = _sanitize_extraction_result(fresh)
try:
_save_semantic_cache(
fresh.get("nodes", []),
Expand All @@ -2748,6 +2778,7 @@ def _progress(idx: int, total: int, _result: dict) -> None:
"input_tokens": ast_result.get("input_tokens", 0) + sem_result.get("input_tokens", 0),
"output_tokens": ast_result.get("output_tokens", 0) + sem_result.get("output_tokens", 0),
}
merged = _sanitize_extraction_result(merged)

graph_json_path = graphify_out / "graph.json"
analysis_path = graphify_out / ".graphify_analysis.json"
Expand Down
46 changes: 36 additions & 10 deletions graphify/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,18 @@ def _norm_source_file(p: str | None, root: str | None = None) -> str | None:
return p


def _dict_items(value: object) -> list[dict]:
"""Return only dict entries from a graph list."""
if not isinstance(value, list):
return []
return [item for item in value if isinstance(item, dict)]


def _label_from_id(node_id: str) -> str:
"""Derive a readable fallback label from a node id."""
return " ".join(part for part in str(node_id).replace("-", "_").split("_") if part).title()


def edge_data(G: nx.Graph, u: str, v: str) -> dict:
"""Return one edge attribute dict for (u, v), tolerating MultiGraph.

Expand Down Expand Up @@ -112,15 +124,17 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
root: if given, absolute source_file paths from semantic subagents are made
relative to root so all nodes share a consistent path key (#932).
"""
extraction = dict(extraction)
_root = str(Path(root).resolve()) if root else None
# NetworkX <= 3.1 serialised edges as "links"; remap to "edges" for compatibility.
if "edges" not in extraction and "links" in extraction:
extraction = dict(extraction, edges=extraction["links"])
extraction["edges"] = extraction["links"]

for key in ("nodes", "edges", "hyperedges"):
extraction[key] = _dict_items(extraction.get(key))

# Canonicalize legacy node/edge schema before validation.
for node in extraction.get("nodes", []):
if not isinstance(node, dict):
continue
if "source" in node and "source_file" not in node:
# Count edges that reference this node so the warning is actionable (#479)
node_id = node.get("id", "?")
Expand All @@ -135,6 +149,8 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
file=sys.stderr,
)
node["source_file"] = node.pop("source")
if not node.get("label"):
node["label"] = _label_from_id(node.get("id", ""))
# Default missing/None file_type to "concept" so legacy graph.json
# entries (and stub nodes preserved by `_rebuild_code` from older
# graphify versions that didn't always populate file_type) don't
Expand All @@ -145,6 +161,20 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}:
node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept")

for edge in extraction.get("edges", []):
if "confience_score" in edge:
typo_score = edge.pop("confience_score")
if "confidence_score" not in edge:
edge["confidence_score"] = typo_score
if "source" not in edge and "from" in edge:
edge["source"] = edge["from"]
if "target" not in edge and "to" in edge:
edge["target"] = edge["to"]
if not edge.get("relation"):
edge["relation"] = "conceptually_related_to"
if not edge.get("source_file"):
edge["source_file"] = "unknown"

errors = validate_extraction(extraction)
# Dangling edges (stdlib/external imports) are expected - only warn about real schema errors.
real_errors = [e for e in errors if "does not match any node id" not in e]
Expand All @@ -161,10 +191,6 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
# e.g. "Session_ValidateToken" maps to "session_validatetoken".
norm_to_id: dict[str, str] = {_normalize_id(nid): nid for nid in node_set}
for edge in extraction.get("edges", []):
if "source" not in edge and "from" in edge:
edge["source"] = edge["from"]
if "target" not in edge and "to" in edge:
edge["target"] = edge["to"]
if "source" not in edge or "target" not in edge:
continue
src, tgt = edge["source"], edge["target"]
Expand Down Expand Up @@ -214,9 +240,9 @@ def build(
from graphify.dedup import deduplicate_entities
combined: dict = {"nodes": [], "edges": [], "hyperedges": [], "input_tokens": 0, "output_tokens": 0}
for ext in extractions:
combined["nodes"].extend(ext.get("nodes", []))
combined["edges"].extend(ext.get("edges", []))
combined["hyperedges"].extend(ext.get("hyperedges", []))
combined["nodes"].extend(_dict_items(ext.get("nodes", [])))
combined["edges"].extend(_dict_items(ext.get("edges", [])))
combined["hyperedges"].extend(_dict_items(ext.get("hyperedges", [])))
combined["input_tokens"] += ext.get("input_tokens", 0)
combined["output_tokens"] += ext.get("output_tokens", 0)
if dedup and combined["nodes"]:
Expand Down
19 changes: 16 additions & 3 deletions graphify/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@
_GRAPHIFY_OUT = os.environ.get("GRAPHIFY_OUT", "graphify-out")


def _dict_items(value: object) -> list[dict]:
"""Return only dict entries from a cached/extracted graph list."""
if not isinstance(value, list):
return []
return [item for item in value if isinstance(item, dict)]


def _body_content(content: bytes) -> bytes:
"""Strip YAML frontmatter from Markdown content, returning only the body."""
text = content.decode(errors="replace")
Expand Down Expand Up @@ -280,9 +287,9 @@ def check_semantic_cache(
p = Path(root) / p
result = load_cached(p, root, kind="semantic")
if result is not None:
cached_nodes.extend(result.get("nodes", []))
cached_edges.extend(result.get("edges", []))
cached_hyperedges.extend(result.get("hyperedges", []))
cached_nodes.extend(_dict_items(result.get("nodes", [])))
cached_edges.extend(_dict_items(result.get("edges", [])))
cached_hyperedges.extend(_dict_items(result.get("hyperedges", [])))
else:
uncached.append(fpath)

Expand All @@ -306,14 +313,20 @@ def save_semantic_cache(

by_file: dict[str, dict] = defaultdict(lambda: {"nodes": [], "edges": [], "hyperedges": []})
for n in nodes:
if not isinstance(n, dict):
continue
src = n.get("source_file", "")
if src:
by_file[src]["nodes"].append(n)
for e in edges:
if not isinstance(e, dict):
continue
src = e.get("source_file", "")
if src:
by_file[src]["edges"].append(e)
for h in (hyperedges or []):
if not isinstance(h, dict):
continue
src = h.get("source_file", "")
if src:
by_file[src]["hyperedges"].append(h)
Expand Down
17 changes: 17 additions & 0 deletions graphify/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,11 @@ def _git_head() -> str | None:
return None


def _label_from_id(node_id: str) -> str:
"""Derive a readable fallback label from a node id."""
return " ".join(part for part in str(node_id).replace("-", "_").split("_") if part).title()


def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, force: bool = False, built_at_commit: str | None = None) -> bool:
# Safety check: refuse to silently shrink an existing graph (#479)
existing_path = Path(output_path)
Expand Down Expand Up @@ -499,9 +504,21 @@ def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *,
except TypeError:
data = json_graph.node_link_data(G)
for node in data["nodes"]:
if not node.get("label"):
node["label"] = _label_from_id(node.get("id", ""))
if not node.get("source_file"):
node["source_file"] = "unknown"
node["community"] = node_community.get(node["id"])
node["norm_label"] = _strip_diacritics(node.get("label", "")).lower()
for link in data["links"]:
if "confience_score" in link:
typo_score = link.pop("confience_score")
if "confidence_score" not in link:
link["confidence_score"] = typo_score
if not link.get("relation"):
link["relation"] = "conceptually_related_to"
if not link.get("source_file"):
link["source_file"] = "unknown"
if "confidence_score" not in link:
conf = link.get("confidence", "EXTRACTED")
link["confidence_score"] = _CONFIDENCE_SCORE_DEFAULTS.get(conf, 1.0)
Expand Down
Loading