diff --git a/README.md b/README.md index 0ece9978a..c2c9bad1b 100644 --- a/README.md +++ b/README.md @@ -323,6 +323,7 @@ These are only needed for **headless / CI extraction** (`graphify extract`). Whe | `GEMINI_API_KEY` or `GOOGLE_API_KEY` | Google Gemini backend | `--backend gemini` | | `OPENAI_API_KEY` | OpenAI or OpenAI-compatible APIs | `--backend openai` | | `DEEPSEEK_API_KEY` | DeepSeek backend | `--backend deepseek` | +| `OPENROUTER_API_KEY` | OpenRouter DeepSeek/Kimi backends | default for extraction via `openrouter-deepseek`; explicit `--backend openrouter-deepseek` or `--backend openrouter-kimi` | | `MOONSHOT_API_KEY` | Kimi Code backend | `--backend kimi` | | `OLLAMA_BASE_URL` | Ollama local inference URL | `--backend ollama` (default: `http://localhost:11434`) | | `OLLAMA_MODEL` | Ollama model name | `--backend ollama` (default: auto-detect) | @@ -343,7 +344,7 @@ These are only needed for **headless / CI extraction** (`graphify extract`). Whe - **Code files** — processed locally via tree-sitter. Nothing leaves your machine. - **Video / audio** — transcribed locally with faster-whisper. Nothing leaves your machine. -- **Docs, PDFs, images** — sent to your AI assistant for semantic extraction (via the `/graphify` skill, using whatever model your IDE session runs). Headless `graphify extract` requires `GEMINI_API_KEY` / `GOOGLE_API_KEY` (Gemini), `MOONSHOT_API_KEY` (Kimi), `ANTHROPIC_API_KEY` (Claude), `OPENAI_API_KEY` (OpenAI), `DEEPSEEK_API_KEY` (DeepSeek), a running Ollama instance (`OLLAMA_BASE_URL`), AWS credentials via the standard provider chain (Bedrock - no API key needed, uses IAM), or the `claude` CLI binary (Claude Code - no API key needed, uses your Claude subscription). The `--dedup-llm` flag uses the same key. +- **Docs, PDFs, images** — sent to your AI assistant for semantic extraction (via the `/graphify` skill, using whatever model your IDE session runs). Headless `graphify extract` defaults to OpenRouter DeepSeek when `OPENROUTER_API_KEY` is set, or can use `DEEPSEEK_API_KEY` (DeepSeek), `GEMINI_API_KEY` / `GOOGLE_API_KEY` (Gemini), `MOONSHOT_API_KEY` (Kimi), `ANTHROPIC_API_KEY` (Claude), `OPENAI_API_KEY` (OpenAI), a running Ollama instance (`OLLAMA_BASE_URL`), AWS credentials via the standard provider chain (Bedrock - no API key needed, uses IAM), or the `claude` CLI binary (Claude Code - no API key needed, uses your Claude subscription). The `--dedup-llm` flag uses the same key. - No telemetry, no usage tracking, no analytics. --- @@ -453,7 +454,8 @@ graphify kiro install / uninstall graphify antigravity install / uninstall graphify extract ./docs # headless LLM extraction for CI (no IDE needed) -graphify extract ./docs --backend gemini # explicit backend: gemini, kimi, claude, openai, deepseek, ollama, bedrock, or claude-cli +graphify extract ./docs --backend openrouter-deepseek # explicit OpenRouter DeepSeek backend (default when OPENROUTER_API_KEY is set) +graphify extract ./docs --backend gemini # explicit backend: openrouter-deepseek, openrouter-kimi, deepseek, gemini, kimi, claude, openai, ollama, bedrock, or claude-cli graphify extract ./docs --backend gemini --model gemini-3.1-pro-preview graphify extract ./docs --backend ollama # local Ollama (set OLLAMA_BASE_URL / OLLAMA_MODEL) - no API key needed for loopback GRAPHIFY_OLLAMA_NUM_CTX=32768 graphify extract ./docs --backend ollama # override KV-cache window (auto-sized by default) @@ -470,6 +472,7 @@ graphify extract ./docs --force # overwrite graph.json even if ne graphify extract ./docs --dedup-llm # LLM tiebreaker for ambiguous entity pairs (uses same API key) graphify extract ./docs --global --as myrepo # extract and register into the cross-project global graph GRAPHIFY_MAX_OUTPUT_TOKENS=32768 graphify extract ./docs --backend claude # raise output cap for dense corpora +graphify quality graphify-out/graph.json # schema-quality gate for generated graph JSON graphify export callflow-html # graphify-out/-callflow.html graphify export callflow-html --max-sections 8 # cap generated architecture sections diff --git a/graphify/__main__.py b/graphify/__main__.py index 895f626ef..4e5ddcab4 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -1241,6 +1241,8 @@ def main() -> None: print(" --context C explicit edge-context filter (repeatable)") print(" --budget N cap output at N tokens (default 2000)") print(" --graph path to graph.json (default graphify-out/graph.json)") + print(" quality [graph.json] inspect graph.json schema quality") + print(" --json emit machine-readable quality report") print(" save-result save a Q&A result to graphify-out/memory/ for graph feedback loop") print(" --question Q the question asked") print(" --answer A the answer to save") @@ -1256,7 +1258,7 @@ def main() -> None: print(" --top-k-edges N per-symbol outbound edges in inspector (default 12)") print(" --label NAME project label in header") print(" extract headless full extraction (AST + semantic LLM) for CI/scripts") - print(" --backend B gemini|kimi|claude|openai|deepseek|ollama (default: whichever API key is set)") + print(" --backend B openrouter-deepseek|openrouter-kimi|deepseek|gemini|kimi|claude|openai|ollama (default: OpenRouter DeepSeek when OPENROUTER_API_KEY is set)") print(" --model M override backend default model") print(" --max-workers N AST extraction subprocess count (default: cpu_count)") print(" --token-budget N per-chunk token cap for semantic extraction (default: 60000)") @@ -1576,6 +1578,26 @@ def main() -> None: source_nodes=opts.nodes or None, ) print(f"Saved to {out}") + elif cmd == "quality": + from graphify.quality import format_report, inspect_graph + graph_path = Path(_default_graph_path()) + emit_json = False + for arg in sys.argv[2:]: + if arg == "--json": + emit_json = True + else: + graph_path = Path(arg) + try: + report = inspect_graph(graph_path) + except Exception as exc: + print(f"error: could not inspect graph quality: {exc}", file=sys.stderr) + sys.exit(1) + if emit_json: + print(json.dumps(report, indent=2)) + else: + print(format_report(report)) + if report["status"] != "pass": + sys.exit(1) elif cmd == "path": if len(sys.argv) < 4: print("Usage: graphify path \"\" \"\" [--graph path]", file=sys.stderr) @@ -2402,7 +2424,7 @@ def _load_graph(p: str): # has an API key set. if len(sys.argv) < 3: print( - "Usage: graphify extract [--backend gemini|kimi|claude|openai|deepseek|ollama] " + "Usage: graphify extract [--backend openrouter-deepseek|openrouter-kimi|deepseek|gemini|kimi|claude|openai|ollama] " "[--model M] [--out DIR] [--google-workspace] [--no-cluster] " "[--max-workers N] [--token-budget N] [--max-concurrency N] " "[--api-timeout S]", @@ -2525,14 +2547,16 @@ def _parse_float(name: str, raw: str) -> float: extract_corpus_parallel as _extract_corpus_parallel, _format_backend_env_keys, _get_backend_api_key, + _sanitize_extraction_result, ) if backend is None: backend = _detect_backend() if backend is None: print( - "error: no LLM API key found. Set GEMINI_API_KEY or GOOGLE_API_KEY " - "(gemini), MOONSHOT_API_KEY (kimi), ANTHROPIC_API_KEY (claude), " - "OPENAI_API_KEY (openai), DEEPSEEK_API_KEY (deepseek), " + "error: no LLM API key found. Set OPENROUTER_API_KEY " + "(default openrouter-deepseek), DEEPSEEK_API_KEY (deepseek), " + "GEMINI_API_KEY or GOOGLE_API_KEY (gemini), MOONSHOT_API_KEY (kimi), " + "ANTHROPIC_API_KEY (claude), OPENAI_API_KEY (openai), " "or pass --backend.", file=sys.stderr, ) @@ -2677,11 +2701,16 @@ def _parse_float(name: str, raw: str) -> float: cached_nodes, cached_edges, cached_hyperedges, uncached_paths = ( _check_semantic_cache(sem_paths_str, root=target) ) + cached_fragment = _sanitize_extraction_result({ + "nodes": cached_nodes, + "edges": cached_edges, + "hyperedges": cached_hyperedges, + }) sem_cache_hits = len(semantic_files) - len(uncached_paths) sem_cache_misses = len(uncached_paths) - sem_result["nodes"].extend(cached_nodes) - sem_result["edges"].extend(cached_edges) - sem_result["hyperedges"].extend(cached_hyperedges) + sem_result["nodes"].extend(cached_fragment["nodes"]) + sem_result["edges"].extend(cached_fragment["edges"]) + sem_result["hyperedges"].extend(cached_fragment["hyperedges"]) if sem_cache_hits: print(f"[graphify extract] semantic cache: {sem_cache_hits} hit / {sem_cache_misses} miss") @@ -2722,6 +2751,7 @@ def _progress(idx: int, total: int, _result: dict) -> None: file=sys.stderr, ) fresh = {"nodes": [], "edges": [], "hyperedges": [], "input_tokens": 0, "output_tokens": 0} + fresh = _sanitize_extraction_result(fresh) try: _save_semantic_cache( fresh.get("nodes", []), @@ -2748,6 +2778,7 @@ def _progress(idx: int, total: int, _result: dict) -> None: "input_tokens": ast_result.get("input_tokens", 0) + sem_result.get("input_tokens", 0), "output_tokens": ast_result.get("output_tokens", 0) + sem_result.get("output_tokens", 0), } + merged = _sanitize_extraction_result(merged) graph_json_path = graphify_out / "graph.json" analysis_path = graphify_out / ".graphify_analysis.json" diff --git a/graphify/build.py b/graphify/build.py index cc229fdaa..9b0025b40 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -83,6 +83,18 @@ def _norm_source_file(p: str | None, root: str | None = None) -> str | None: return p +def _dict_items(value: object) -> list[dict]: + """Return only dict entries from a graph list.""" + if not isinstance(value, list): + return [] + return [item for item in value if isinstance(item, dict)] + + +def _label_from_id(node_id: str) -> str: + """Derive a readable fallback label from a node id.""" + return " ".join(part for part in str(node_id).replace("-", "_").split("_") if part).title() + + def edge_data(G: nx.Graph, u: str, v: str) -> dict: """Return one edge attribute dict for (u, v), tolerating MultiGraph. @@ -112,15 +124,17 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat root: if given, absolute source_file paths from semantic subagents are made relative to root so all nodes share a consistent path key (#932). """ + extraction = dict(extraction) _root = str(Path(root).resolve()) if root else None # NetworkX <= 3.1 serialised edges as "links"; remap to "edges" for compatibility. if "edges" not in extraction and "links" in extraction: - extraction = dict(extraction, edges=extraction["links"]) + extraction["edges"] = extraction["links"] + + for key in ("nodes", "edges", "hyperedges"): + extraction[key] = _dict_items(extraction.get(key)) # Canonicalize legacy node/edge schema before validation. for node in extraction.get("nodes", []): - if not isinstance(node, dict): - continue if "source" in node and "source_file" not in node: # Count edges that reference this node so the warning is actionable (#479) node_id = node.get("id", "?") @@ -135,6 +149,8 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat file=sys.stderr, ) node["source_file"] = node.pop("source") + if not node.get("label"): + node["label"] = _label_from_id(node.get("id", "")) # Default missing/None file_type to "concept" so legacy graph.json # entries (and stub nodes preserved by `_rebuild_code` from older # graphify versions that didn't always populate file_type) don't @@ -145,6 +161,20 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}: node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept") + for edge in extraction.get("edges", []): + if "confience_score" in edge: + typo_score = edge.pop("confience_score") + if "confidence_score" not in edge: + edge["confidence_score"] = typo_score + if "source" not in edge and "from" in edge: + edge["source"] = edge["from"] + if "target" not in edge and "to" in edge: + edge["target"] = edge["to"] + if not edge.get("relation"): + edge["relation"] = "conceptually_related_to" + if not edge.get("source_file"): + edge["source_file"] = "unknown" + errors = validate_extraction(extraction) # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors. real_errors = [e for e in errors if "does not match any node id" not in e] @@ -161,10 +191,6 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat # e.g. "Session_ValidateToken" maps to "session_validatetoken". norm_to_id: dict[str, str] = {_normalize_id(nid): nid for nid in node_set} for edge in extraction.get("edges", []): - if "source" not in edge and "from" in edge: - edge["source"] = edge["from"] - if "target" not in edge and "to" in edge: - edge["target"] = edge["to"] if "source" not in edge or "target" not in edge: continue src, tgt = edge["source"], edge["target"] @@ -214,9 +240,9 @@ def build( from graphify.dedup import deduplicate_entities combined: dict = {"nodes": [], "edges": [], "hyperedges": [], "input_tokens": 0, "output_tokens": 0} for ext in extractions: - combined["nodes"].extend(ext.get("nodes", [])) - combined["edges"].extend(ext.get("edges", [])) - combined["hyperedges"].extend(ext.get("hyperedges", [])) + combined["nodes"].extend(_dict_items(ext.get("nodes", []))) + combined["edges"].extend(_dict_items(ext.get("edges", []))) + combined["hyperedges"].extend(_dict_items(ext.get("hyperedges", []))) combined["input_tokens"] += ext.get("input_tokens", 0) combined["output_tokens"] += ext.get("output_tokens", 0) if dedup and combined["nodes"]: diff --git a/graphify/cache.py b/graphify/cache.py index 2052cf7aa..11813f131 100644 --- a/graphify/cache.py +++ b/graphify/cache.py @@ -14,6 +14,13 @@ _GRAPHIFY_OUT = os.environ.get("GRAPHIFY_OUT", "graphify-out") +def _dict_items(value: object) -> list[dict]: + """Return only dict entries from a cached/extracted graph list.""" + if not isinstance(value, list): + return [] + return [item for item in value if isinstance(item, dict)] + + def _body_content(content: bytes) -> bytes: """Strip YAML frontmatter from Markdown content, returning only the body.""" text = content.decode(errors="replace") @@ -280,9 +287,9 @@ def check_semantic_cache( p = Path(root) / p result = load_cached(p, root, kind="semantic") if result is not None: - cached_nodes.extend(result.get("nodes", [])) - cached_edges.extend(result.get("edges", [])) - cached_hyperedges.extend(result.get("hyperedges", [])) + cached_nodes.extend(_dict_items(result.get("nodes", []))) + cached_edges.extend(_dict_items(result.get("edges", []))) + cached_hyperedges.extend(_dict_items(result.get("hyperedges", []))) else: uncached.append(fpath) @@ -306,14 +313,20 @@ def save_semantic_cache( by_file: dict[str, dict] = defaultdict(lambda: {"nodes": [], "edges": [], "hyperedges": []}) for n in nodes: + if not isinstance(n, dict): + continue src = n.get("source_file", "") if src: by_file[src]["nodes"].append(n) for e in edges: + if not isinstance(e, dict): + continue src = e.get("source_file", "") if src: by_file[src]["edges"].append(e) for h in (hyperedges or []): + if not isinstance(h, dict): + continue src = h.get("source_file", "") if src: by_file[src]["hyperedges"].append(h) diff --git a/graphify/export.py b/graphify/export.py index a71c927c7..47129893c 100644 --- a/graphify/export.py +++ b/graphify/export.py @@ -472,6 +472,11 @@ def _git_head() -> str | None: return None +def _label_from_id(node_id: str) -> str: + """Derive a readable fallback label from a node id.""" + return " ".join(part for part in str(node_id).replace("-", "_").split("_") if part).title() + + def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, force: bool = False, built_at_commit: str | None = None) -> bool: # Safety check: refuse to silently shrink an existing graph (#479) existing_path = Path(output_path) @@ -499,9 +504,21 @@ def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, except TypeError: data = json_graph.node_link_data(G) for node in data["nodes"]: + if not node.get("label"): + node["label"] = _label_from_id(node.get("id", "")) + if not node.get("source_file"): + node["source_file"] = "unknown" node["community"] = node_community.get(node["id"]) node["norm_label"] = _strip_diacritics(node.get("label", "")).lower() for link in data["links"]: + if "confience_score" in link: + typo_score = link.pop("confience_score") + if "confidence_score" not in link: + link["confidence_score"] = typo_score + if not link.get("relation"): + link["relation"] = "conceptually_related_to" + if not link.get("source_file"): + link["source_file"] = "unknown" if "confidence_score" not in link: conf = link.get("confidence", "EXTRACTED") link["confidence_score"] = _CONFIDENCE_SCORE_DEFAULTS.get(conf, 1.0) diff --git a/graphify/llm.py b/graphify/llm.py index 58786f681..b3395cb2d 100644 --- a/graphify/llm.py +++ b/graphify/llm.py @@ -1,5 +1,5 @@ # Direct LLM backend for semantic extraction — supports Claude, Kimi K2.6, -# Gemini, and OpenAI. +# Gemini, OpenAI, DeepSeek, and OpenRouter. # Used by `graphify extract . --backend gemini` and the benchmark scripts. # The default graphify pipeline uses Claude Code subagents via skill.md; # this module provides a direct API path for non-Claude-Code environments. @@ -7,6 +7,7 @@ import json import os +import shutil import sys import time from collections.abc import Callable @@ -98,6 +99,24 @@ def _get_tokenizer(): "temperature": 0, "max_tokens": 16384, }, + "openrouter-deepseek": { + "base_url": "https://openrouter.ai/api/v1", + "default_model": "deepseek/deepseek-v4-flash", + "env_key": "OPENROUTER_API_KEY", + "model_env_key": "GRAPHIFY_OPENROUTER_DEEPSEEK_MODEL", + "pricing": {"input": 0.14, "output": 0.28}, # placeholder; OpenRouter billing is authoritative + "temperature": 0, + "max_tokens": 16384, + }, + "openrouter-kimi": { + "base_url": "https://openrouter.ai/api/v1", + "default_model": "moonshotai/kimi-k2.6", + "env_key": "OPENROUTER_API_KEY", + "model_env_key": "GRAPHIFY_OPENROUTER_KIMI_MODEL", + "pricing": {"input": 0.74, "output": 4.66}, # placeholder; OpenRouter billing is authoritative + "temperature": 0, + "max_tokens": 8192, + }, "bedrock": { "default_model": "anthropic.claude-3-5-sonnet-20241022-v2:0", "model_env_key": "GRAPHIFY_BEDROCK_MODEL", @@ -138,6 +157,13 @@ def _resolve_max_tokens(default: int) -> int: - EXTRACTED: relationship explicit in source (import, call, citation, reference) - INFERRED: reasonable inference (shared data structure, implied dependency) - AMBIGUOUS: uncertain — flag for review, do not omit +- Prefer fewer, better nodes over exhaustive extraction. +- Extract at most 12 nodes per input file and at most 24 edges per input file. +- Omit generic low-value concepts unless they connect two specific source artifacts. +- Keep labels short. Keep relation values from the schema list only. +- Every item inside nodes, edges, and hyperedges must be a JSON object. Never emit strings in those arrays. +- Use the field name source_file for provenance. Do not use source except as an edge endpoint. +- Use confidence_score exactly. Do not misspell it. Node ID format: lowercase, only [a-z0-9_], no dots or slashes. Format: {stem}_{entity} where stem = filename without extension, entity = symbol name (both normalised). @@ -166,6 +192,42 @@ def _read_files(paths: list[Path], root: Path) -> str: _LLM_JSON_MAX_BYTES = 10 * 1024 * 1024 # 10 MB hard cap before json.loads (F-016) +def _merge_extra_body(kwargs: dict, extra_body: dict) -> None: + """Merge provider-specific OpenAI SDK extra_body values.""" + current = kwargs.get("extra_body") + if isinstance(current, dict): + merged = dict(current) + merged.update(extra_body) + kwargs["extra_body"] = merged + else: + kwargs["extra_body"] = extra_body + + +def _strip_json_fences(raw: str) -> str: + """Remove common markdown/code-fence wrapping around a JSON object.""" + raw = raw.strip() + if raw.startswith("```"): + raw = raw.split("```", 2)[1] + if raw.lstrip().startswith("json"): + raw = raw.lstrip()[4:] + raw = raw.rsplit("```", 1)[0] + return raw.strip() + + +def _remove_json_control_chars(raw: str) -> str: + """Drop ASCII control characters that make otherwise-valid JSON fail.""" + return "".join(ch for ch in raw if ch in "\t\n\r" or ord(ch) >= 32) + + +def _extract_json_object(raw: str) -> str: + """Return the outermost JSON object substring if prose leaked around it.""" + start = raw.find("{") + end = raw.rfind("}") + if start != -1 and end != -1 and end > start: + return raw[start:end + 1] + return raw + + def _parse_llm_json(raw: str) -> dict: """Strip optional markdown fences and parse JSON. Returns empty fragment on failure. @@ -179,17 +241,50 @@ def _parse_llm_json(raw: str) -> dict: file=sys.stderr, ) return {"nodes": [], "edges": [], "hyperedges": []} - if raw.startswith("```"): - raw = raw.split("```", 2)[1] - if raw.startswith("json"): - raw = raw[4:] - raw = raw.rsplit("```", 1)[0] - try: - return json.loads(raw.strip()) - except json.JSONDecodeError as exc: - print(f"[graphify] LLM returned invalid JSON, skipping chunk: {exc}", file=sys.stderr) + candidates = [ + raw, + _strip_json_fences(raw), + _extract_json_object(_strip_json_fences(raw)), + _remove_json_control_chars(_extract_json_object(_strip_json_fences(raw))), + ] + last_exc: json.JSONDecodeError | None = None + for candidate in candidates: + try: + return json.loads(candidate.strip()) + except json.JSONDecodeError as exc: + last_exc = exc + if last_exc is not None: + print(f"[graphify] LLM returned invalid JSON, skipping chunk: {last_exc}", file=sys.stderr) + else: + print("[graphify] LLM returned invalid JSON, skipping chunk", file=sys.stderr) + return {"nodes": [], "edges": [], "hyperedges": []} + + +def _sanitize_extraction_result(result: dict) -> dict: + """Normalize parsed LLM output to lists of dicts expected downstream.""" + if not isinstance(result, dict): return {"nodes": [], "edges": [], "hyperedges": []} + cleaned = dict(result) + dropped = 0 + for key in ("nodes", "edges", "hyperedges"): + value = cleaned.get(key) + if not isinstance(value, list): + if value is not None: + dropped += 1 + cleaned[key] = [] + continue + good = [item for item in value if isinstance(item, dict)] + dropped += len(value) - len(good) + cleaned[key] = good + + if dropped: + print( + f"[graphify] dropped {dropped} malformed LLM graph item(s)", + file=sys.stderr, + ) + return cleaned + def _response_is_hollow(raw_content: str | None, parsed: dict) -> bool: """Detect a successful HTTP response that yielded no usable extraction. @@ -232,6 +327,18 @@ def _get_backend_api_key(backend: str) -> str: return "" +def _backend_is_configured(backend: str) -> bool: + """Return whether a backend has the credentials or local runtime needed.""" + if backend == "ollama": + _validate_ollama_base_url(os.environ.get("OLLAMA_BASE_URL", BACKENDS[backend]["base_url"])) + return True + if backend == "bedrock": + return bool(os.environ.get("AWS_PROFILE") or os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION")) + if backend == "claude-cli": + return bool(shutil.which("claude")) + return bool(_get_backend_api_key(backend)) + + def _format_backend_env_keys(backend: str) -> str: """Return user-facing accepted API-key variable names.""" keys = _backend_env_keys(backend) @@ -291,15 +398,22 @@ def _call_openai_compat( {"role": "system", "content": _EXTRACTION_SYSTEM}, {"role": "user", "content": user_message}, ], - "max_completion_tokens": max_completion_tokens, } + if "openrouter.ai" in base_url: + # OpenRouter's OpenAI-compatible surface is happiest with max_tokens; + # some upstream providers reject max_completion_tokens. + kwargs["max_tokens"] = max_completion_tokens + kwargs["response_format"] = {"type": "json_object"} + _merge_extra_body(kwargs, {"provider": {"require_parameters": True}}) + else: + kwargs["max_completion_tokens"] = max_completion_tokens if temperature is not None: kwargs["temperature"] = temperature if reasoning_effort is not None: kwargs["reasoning_effort"] = reasoning_effort # Kimi-k2.6 is a reasoning model — disable thinking so content isn't empty if "moonshot" in base_url: - kwargs["extra_body"] = {"thinking": {"type": "disabled"}} + _merge_extra_body(kwargs, {"thinking": {"type": "disabled"}}) # Ollama defaults num_ctx to 2048 and silently truncates prompts larger # than that — the symptom is hollow 200 OK responses after the first few # chunks (#798). We derive num_ctx from the actual prompt size so we don't @@ -348,7 +462,7 @@ def _call_openai_compat( if not resp.choices or resp.choices[0].message is None: raise ValueError("LLM returned empty or filtered response") raw_content = resp.choices[0].message.content - result = _parse_llm_json(raw_content or "{}") + result = _sanitize_extraction_result(_parse_llm_json(raw_content or "{}")) result["input_tokens"] = resp.usage.prompt_tokens if resp.usage else 0 result["output_tokens"] = resp.usage.completion_tokens if resp.usage else 0 result["model"] = model @@ -402,7 +516,7 @@ def _call_claude(api_key: str, model: str, user_message: str, max_tokens: int = messages=[{"role": "user", "content": user_message}], ) raw_content = resp.content[0].text if resp.content else None - result = _parse_llm_json(raw_content or "{}") + result = _sanitize_extraction_result(_parse_llm_json(raw_content or "{}")) result["input_tokens"] = resp.usage.input_tokens if resp.usage else 0 result["output_tokens"] = resp.usage.output_tokens if resp.usage else 0 result["model"] = model @@ -427,7 +541,6 @@ def _call_claude_cli(user_message: str, max_tokens: int = 8192) -> dict: ANTHROPIC_API_KEY. Useful for Pro/Max subscribers who don't want to provision a pay-as-you-go API key just to run graphify's semantic pass. """ - import shutil import subprocess if shutil.which("claude") is None: @@ -464,7 +577,7 @@ def _call_claude_cli(user_message: str, max_tokens: int = 8192) -> dict: ) from exc raw_content = envelope.get("result", "") - result = _parse_llm_json(raw_content or "{}") + result = _sanitize_extraction_result(_parse_llm_json(raw_content or "{}")) usage = envelope.get("usage") or {} result["input_tokens"] = ( int(usage.get("input_tokens", 0) or 0) @@ -514,7 +627,7 @@ def _call_bedrock(model: str, user_message: str, max_tokens: int = 8192) -> dict raise RuntimeError(f"Bedrock API error ({code}): {msg}") from exc text = resp.get("output", {}).get("message", {}).get("content", [{}])[0].get("text", "{}") - result = _parse_llm_json(text) + result = _sanitize_extraction_result(_parse_llm_json(text)) usage = resp.get("usage", {}) result["input_tokens"] = usage.get("inputTokens", 0) result["output_tokens"] = usage.get("outputTokens", 0) @@ -985,7 +1098,7 @@ def _call_llm(prompt: str, *, backend: str, max_tokens: int = 200) -> str: return resp.content[0].text if resp.content else "" if backend == "claude-cli": - import shutil, subprocess + import subprocess if shutil.which("claude") is None: raise RuntimeError("Claude Code CLI not found on $PATH") proc = subprocess.run( @@ -1021,7 +1134,7 @@ def _call_llm(prompt: str, *, backend: str, max_tokens: int = 200) -> str: ) return resp.get("output", {}).get("message", {}).get("content", [{}])[0].get("text", "") - # OpenAI-compatible (kimi, openai, gemini, ollama) + # OpenAI-compatible (Kimi, OpenAI, Gemini, Ollama, OpenRouter) try: from openai import OpenAI except ImportError as exc: @@ -1030,8 +1143,11 @@ def _call_llm(prompt: str, *, backend: str, max_tokens: int = 200) -> str: kwargs: dict = { "model": mdl, "messages": [{"role": "user", "content": prompt}], - "max_completion_tokens": max_tokens, } + if "openrouter.ai" in cfg["base_url"]: + kwargs["max_tokens"] = max_tokens + else: + kwargs["max_completion_tokens"] = max_tokens temperature = cfg.get("temperature", 0) if temperature is not None: kwargs["temperature"] = temperature @@ -1091,7 +1207,9 @@ def _validate_ollama_base_url(url: str) -> None: def detect_backend() -> str | None: """Return the name of whichever backend has an API key set, or None. - Priority: gemini → kimi → claude → openai → bedrock → ollama (last, opt-in). + Priority: GRAPHIFY_DEFAULT_BACKEND if valid and configured, then + OpenRouter DeepSeek → OpenRouter Kimi → direct DeepSeek → Gemini → Kimi → + Claude → OpenAI → Bedrock → Ollama (last, opt-in). Ollama is intentionally checked LAST so a paid API key (Anthropic/OpenAI/etc.) is never silently shadowed by an incidental OLLAMA_BASE_URL in the environment @@ -1099,7 +1217,11 @@ def detect_backend() -> str | None: key now keeps you on the paid backend; remove the paid key (or pass --backend ollama explicitly) to route to the local model. """ - for backend in ("gemini", "kimi", "claude", "openai", "deepseek"): + explicit = os.environ.get("GRAPHIFY_DEFAULT_BACKEND", "").strip() + if explicit in BACKENDS and _backend_is_configured(explicit): + return explicit + + for backend in ("openrouter-deepseek", "openrouter-kimi", "deepseek", "gemini", "kimi", "claude", "openai"): if _get_backend_api_key(backend): return backend if os.environ.get("AWS_PROFILE") or os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION"): diff --git a/graphify/prs.py b/graphify/prs.py index cd0bc0e74..544eccf36 100644 --- a/graphify/prs.py +++ b/graphify/prs.py @@ -541,8 +541,11 @@ def render_pr_detail(pr: PRInfo, repo: str | None = None) -> None: # Best model per backend for reasoning tasks (different from extraction defaults) _TRIAGE_MODEL_DEFAULTS: dict[str, str] = { + "openrouter-deepseek": "deepseek/deepseek-v4-flash", + "deepseek": "deepseek-v4-flash", "claude": "claude-opus-4-7", "kimi": "kimi-k2.6", + "openrouter-kimi": "moonshotai/kimi-k2.6", "openai": "gpt-4.1-mini", "gemini": "gemini-3-flash-preview", } @@ -559,7 +562,7 @@ def _resolve_triage_backend() -> tuple[str, str]: or _default_model_for_backend(explicit)) return explicit, model - for b in ("claude", "kimi", "openai", "gemini"): + for b in ("openrouter-deepseek", "openrouter-kimi", "deepseek", "claude", "kimi", "openai", "gemini"): if _get_backend_api_key(b): model = (os.environ.get("GRAPHIFY_TRIAGE_MODEL") or _TRIAGE_MODEL_DEFAULTS.get(b) @@ -570,7 +573,7 @@ def _resolve_triage_backend() -> tuple[str, str]: if shutil.which("claude"): return "claude-cli", "claude-code-plan" - return "ollama", _default_model_for_backend("ollama") + raise RuntimeError("No triage backend configured. Set OPENROUTER_API_KEY (preferred) or GRAPHIFY_TRIAGE_BACKEND; refusing to fall back to a local Ollama model by default.") def triage_with_opus(prs: list[PRInfo], base: str) -> None: @@ -624,7 +627,7 @@ def triage_with_opus(prs: list[PRInfo], base: str) -> None: print(text.replace("\n", "\n "), end="", flush=True) print("\n") - elif backend in ("kimi", "openai", "gemini", "ollama"): + elif backend in ("openrouter-deepseek", "deepseek", "kimi", "openrouter-kimi", "openai", "gemini", "ollama"): from openai import OpenAI cfg = BACKENDS[backend] api_key = _get_backend_api_key(backend) or "ollama" diff --git a/graphify/quality.py b/graphify/quality.py new file mode 100644 index 000000000..9f153da93 --- /dev/null +++ b/graphify/quality.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + + +def _edge_list(data: dict[str, Any]) -> list[Any]: + if "links" in data: + value = data.get("links") + else: + value = data.get("edges") + return value if isinstance(value, list) else [] + + +def _node_list(data: dict[str, Any]) -> list[Any]: + value = data.get("nodes", []) + return value if isinstance(value, list) else [] + + +def inspect_graph(path: str | Path) -> dict[str, Any]: + """Return schema-quality counters for a graphify graph.json file.""" + graph_path = Path(path) + data = json.loads(graph_path.read_text(encoding="utf-8")) + raw_nodes = data.get("nodes", []) + raw_edges = data.get("links") if "links" in data else data.get("edges", []) + nodes = _node_list(data) + edges = _edge_list(data) + node_dicts = [n for n in nodes if isinstance(n, dict)] + edge_dicts = [e for e in edges if isinstance(e, dict)] + node_ids = [n.get("id") for n in node_dicts if n.get("id")] + node_id_set = set(node_ids) + + dangling_edges = 0 + for edge in edge_dicts: + src = edge.get("source") + tgt = edge.get("target") + if src and src not in node_id_set: + dangling_edges += 1 + if tgt and tgt not in node_id_set: + dangling_edges += 1 + + issues = { + "non_object_nodes": len(nodes) - len(node_dicts) if isinstance(raw_nodes, list) else 1, + "non_object_edges": len(edges) - len(edge_dicts) if isinstance(raw_edges, list) else 1, + "missing_node_ids": sum(1 for n in node_dicts if not n.get("id")), + "missing_node_labels": sum(1 for n in node_dicts if not n.get("label")), + "missing_node_source_files": sum(1 for n in node_dicts if not n.get("source_file")), + "missing_edge_sources": sum(1 for e in edge_dicts if not e.get("source")), + "missing_edge_targets": sum(1 for e in edge_dicts if not e.get("target")), + "missing_edge_relations": sum(1 for e in edge_dicts if not e.get("relation")), + "missing_edge_confidences": sum(1 for e in edge_dicts if not e.get("confidence")), + "missing_edge_source_files": sum(1 for e in edge_dicts if not e.get("source_file")), + "typo_confience_score_edges": sum(1 for e in edge_dicts if "confience_score" in e), + "duplicate_node_ids": len(node_ids) - len(node_id_set), + "dangling_edge_endpoints": dangling_edges, + } + total_issues = sum(issues.values()) + return { + "path": str(graph_path), + "nodes": len(node_dicts), + "edges": len(edge_dicts), + "issues": issues, + "total_issues": total_issues, + "status": "pass" if total_issues == 0 else "fail", + } + + +def format_report(report: dict[str, Any]) -> str: + """Return a concise human-readable graph quality report.""" + lines = [ + f"Graph quality: {report['status']}", + f" path: {report['path']}", + f" nodes: {report['nodes']}", + f" edges: {report['edges']}", + f" total issues: {report['total_issues']}", + ] + for key, value in report["issues"].items(): + if value: + lines.append(f" {key}: {value}") + return "\n".join(lines) diff --git a/graphify/skill.md b/graphify/skill.md index c3e39b3f4..546a8cd6d 100644 --- a/graphify/skill.md +++ b/graphify/skill.md @@ -88,7 +88,7 @@ The skill pipeline writes all intermediate and final outputs to `graphify-out/` graphify extract ./core/ # → ./core/graphify-out/graph.json graphify extract ./service/ # → ./service/graphify-out/graph.json graphify extract ./platform/ # → ./platform/graphify-out/graph.json -# Add --backend gemini|kimi|openai|deepseek|claude-cli depending on which API key you have set +# Add --backend openrouter-deepseek|openrouter-kimi|deepseek|gemini|kimi|openai|claude-cli depending on which API key you have set # Then merge at the project root: graphify merge-graphs \ diff --git a/pyproject.toml b/pyproject.toml index cfdcd961d..f534be632 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ ollama = ["openai"] bedrock = ["boto3"] gemini = ["openai", "tiktoken"] openai = ["openai", "tiktoken"] +openrouter = ["openai", "tiktoken"] sql = ["tree-sitter-sql"] all = ["mcp", "neo4j", "pypdf", "markdownify", "watchdog", "graspologic; python_version < '3.13'", "python-docx", "openpyxl", "faster-whisper", "yt-dlp", "matplotlib", "openai", "tiktoken", "boto3", "tree-sitter-sql"] diff --git a/tests/test_build.py b/tests/test_build.py index 85d59fd5e..9b74f2979 100644 --- a/tests/test_build.py +++ b/tests/test_build.py @@ -52,6 +52,34 @@ def test_legacy_edge_from_to_canonicalized(): assert G.number_of_edges() == 1 +def test_malformed_llm_fields_are_canonicalized(): + ext = { + "nodes": [ + {"id": "memory_control_plane", "file_type": "document", "source_file": "a.md"}, + {"id": "dex_system", "label": "Dex System", "file_type": "document", "source_file": "b.md"}, + ], + "edges": [ + { + "source": "memory_control_plane", + "target": "dex_system", + "confidence": "INFERRED", + "confidence_score": 0.9, + "confience_score": 0.8, + "source_file": "", + } + ], + "input_tokens": 0, + "output_tokens": 0, + } + G = build_from_json(ext) + assert G.nodes["memory_control_plane"]["label"] == "Memory Control Plane" + edge = G.edges["memory_control_plane", "dex_system"] + assert edge["relation"] == "conceptually_related_to" + assert edge["source_file"] == "unknown" + assert edge["confidence_score"] == 0.9 + assert "confience_score" not in edge + + def test_source_file_backslash_normalized(): """Windows backslash paths and POSIX paths for the same file must produce one node.""" extraction = { diff --git a/tests/test_export.py b/tests/test_export.py index 832c87073..df00f4ec9 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -38,6 +38,39 @@ def test_to_json_nodes_have_community(): for node in data["nodes"]: assert "community" in node + +def test_to_json_canonicalizes_salvageable_schema_fields(): + G = build_from_json({ + "nodes": [ + {"id": "memory_control_plane", "file_type": "document", "source_file": "a.md"}, + {"id": "dex_system", "label": "Dex System", "file_type": "document", "source_file": "b.md"}, + ], + "edges": [ + { + "source": "memory_control_plane", + "target": "dex_system", + "confidence": "INFERRED", + "confidence_score": 0.9, + "confience_score": 0.8, + "source_file": "", + } + ], + "input_tokens": 0, + "output_tokens": 0, + }) + with tempfile.TemporaryDirectory() as tmp: + out = Path(tmp) / "graph.json" + to_json(G, {0: list(G.nodes)}, str(out), force=True) + data = json.loads(out.read_text()) + node = next(n for n in data["nodes"] if n["id"] == "memory_control_plane") + link = data["links"][0] + assert node["label"] == "Memory Control Plane" + assert link["relation"] == "conceptually_related_to" + assert link["source_file"] == "unknown" + assert link["confidence_score"] == 0.9 + assert "confience_score" not in link + + def test_to_cypher_creates_file(): G = make_graph() with tempfile.TemporaryDirectory() as tmp: diff --git a/tests/test_quality.py b/tests/test_quality.py new file mode 100644 index 000000000..4042875ea --- /dev/null +++ b/tests/test_quality.py @@ -0,0 +1,58 @@ +import json + +from graphify.quality import format_report, inspect_graph + + +def test_quality_passes_clean_graph(tmp_path): + graph = { + "nodes": [{"id": "a", "label": "A", "source_file": "a.md"}], + "links": [], + } + path = tmp_path / "graph.json" + path.write_text(json.dumps(graph), encoding="utf-8") + report = inspect_graph(path) + assert report["status"] == "pass" + assert report["total_issues"] == 0 + + +def test_quality_reports_schema_defects(tmp_path): + graph = { + "nodes": [ + {"id": "a", "source_file": ""}, + "bad", + {"id": "a", "label": "Duplicate", "source_file": "a.md"}, + ], + "links": [ + {"source": "a", "target": "missing", "confidence": "INFERRED", "confience_score": 0.8}, + "bad", + ], + } + path = tmp_path / "graph.json" + path.write_text(json.dumps(graph), encoding="utf-8") + report = inspect_graph(path) + assert report["status"] == "fail" + assert report["issues"]["non_object_nodes"] == 1 + assert report["issues"]["non_object_edges"] == 1 + assert report["issues"]["missing_node_labels"] == 1 + assert report["issues"]["missing_node_source_files"] == 1 + assert report["issues"]["missing_edge_relations"] == 1 + assert report["issues"]["missing_edge_source_files"] == 1 + assert report["issues"]["typo_confience_score_edges"] == 1 + assert report["issues"]["duplicate_node_ids"] == 1 + assert report["issues"]["dangling_edge_endpoints"] == 1 + assert "Graph quality: fail" in format_report(report) + + +def test_quality_reports_non_list_graph_fields(tmp_path): + graph = { + "nodes": {"id": "not_a_list"}, + "links": {"source": "not_a_list"}, + } + path = tmp_path / "graph.json" + path.write_text(json.dumps(graph), encoding="utf-8") + report = inspect_graph(path) + assert report["status"] == "fail" + assert report["nodes"] == 0 + assert report["edges"] == 0 + assert report["issues"]["non_object_nodes"] == 1 + assert report["issues"]["non_object_edges"] == 1