safishamsi · jefferdr-rgb · May 19, 2026
diff --git a/README.md b/README.md
@@ -323,6 +323,7 @@ These are only needed for **headless / CI extraction** (`graphify extract`). Whe
 | `GEMINI_API_KEY` or `GOOGLE_API_KEY` | Google Gemini backend | `--backend gemini` |
 | `OPENAI_API_KEY` | OpenAI or OpenAI-compatible APIs | `--backend openai` |
 | `DEEPSEEK_API_KEY` | DeepSeek backend | `--backend deepseek` |
+| `OPENROUTER_API_KEY` | OpenRouter DeepSeek/Kimi backends | `--backend openrouter-deepseek` (the default extraction backend when this key is set) or `--backend openrouter-kimi` |
 | `MOONSHOT_API_KEY` | Kimi Code backend | `--backend kimi` |
 | `OLLAMA_BASE_URL` | Ollama local inference URL | `--backend ollama` (default: `http://localhost:11434`) |
 | `OLLAMA_MODEL` | Ollama model name | `--backend ollama` (default: auto-detect) |
@@ -343,7 +344,7 @@ These are only needed for **headless / CI extraction** (`graphify extract`). Whe
 
 - **Code files** — processed locally via tree-sitter. Nothing leaves your machine.
 - **Video / audio** — transcribed locally with faster-whisper. Nothing leaves your machine.
-- **Docs, PDFs, images** — sent to your AI assistant for semantic extraction (via the `/graphify` skill, using whatever model your IDE session runs). Headless `graphify extract` requires `GEMINI_API_KEY` / `GOOGLE_API_KEY` (Gemini), `MOONSHOT_API_KEY` (Kimi), `ANTHROPIC_API_KEY` (Claude), `OPENAI_API_KEY` (OpenAI), `DEEPSEEK_API_KEY` (DeepSeek), a running Ollama instance (`OLLAMA_BASE_URL`), AWS credentials via the standard provider chain (Bedrock - no API key needed, uses IAM), or the `claude` CLI binary (Claude Code - no API key needed, uses your Claude subscription). The `--dedup-llm` flag uses the same key.
+- **Docs, PDFs, images** — sent to your AI assistant for semantic extraction (via the `/graphify` skill, using whatever model your IDE session runs). Headless `graphify extract` defaults to OpenRouter DeepSeek when `OPENROUTER_API_KEY` is set, or can use `DEEPSEEK_API_KEY` (DeepSeek), `GEMINI_API_KEY` / `GOOGLE_API_KEY` (Gemini), `MOONSHOT_API_KEY` (Kimi), `ANTHROPIC_API_KEY` (Claude), `OPENAI_API_KEY` (OpenAI), a running Ollama instance (`OLLAMA_BASE_URL`), AWS credentials via the standard provider chain (Bedrock - no API key needed, uses IAM), or the `claude` CLI binary (Claude Code - no API key needed, uses your Claude subscription). The `--dedup-llm` flag uses the same key.
 - No telemetry, no usage tracking, no analytics.
 
 ---
@@ -453,7 +454,8 @@ graphify kiro install / uninstall
 graphify antigravity install / uninstall
 
 graphify extract ./docs                        # headless LLM extraction for CI (no IDE needed)
-graphify extract ./docs --backend gemini       # explicit backend: gemini, kimi, claude, openai, deepseek, ollama, bedrock, or claude-cli
+graphify extract ./docs --backend openrouter-deepseek  # explicit OpenRouter DeepSeek backend (default when OPENROUTER_API_KEY is set)
+graphify extract ./docs --backend gemini       # explicit backend: openrouter-deepseek, openrouter-kimi, deepseek, gemini, kimi, claude, openai, ollama, bedrock, or claude-cli
 graphify extract ./docs --backend gemini --model gemini-3.1-pro-preview
 graphify extract ./docs --backend ollama       # local Ollama (set OLLAMA_BASE_URL / OLLAMA_MODEL) - no API key needed for loopback
 GRAPHIFY_OLLAMA_NUM_CTX=32768 graphify extract ./docs --backend ollama   # override KV-cache window (auto-sized by default)
@@ -470,6 +472,7 @@ graphify extract ./docs --force                # overwrite graph.json even if ne
 graphify extract ./docs --dedup-llm            # LLM tiebreaker for ambiguous entity pairs (uses same API key)
 graphify extract ./docs --global --as myrepo   # extract and register into the cross-project global graph
 GRAPHIFY_MAX_OUTPUT_TOKENS=32768 graphify extract ./docs --backend claude  # raise output cap for dense corpora
+graphify quality graphify-out/graph.json       # schema-quality gate for generated graph JSON
 
 graphify export callflow-html                       # graphify-out/<project>-callflow.html
 graphify export callflow-html --max-sections 8      # cap generated architecture sections

diff --git a/graphify/__main__.py b/graphify/__main__.py
@@ -1241,6 +1241,8 @@ def main() -> None:
         print("    --context C             explicit edge-context filter (repeatable)")
         print("    --budget N              cap output at N tokens (default 2000)")
         print("    --graph <path>          path to graph.json (default graphify-out/graph.json)")
+        print("  quality [graph.json]     inspect graph.json schema quality")
+        print("    --json                  emit machine-readable quality report")
         print("  save-result             save a Q&A result to graphify-out/memory/ for graph feedback loop")
         print("    --question Q            the question asked")
         print("    --answer A              the answer to save")
@@ -1256,7 +1258,7 @@ def main() -> None:
         print("    --top-k-edges N         per-symbol outbound edges in inspector (default 12)")
         print("    --label NAME            project label in header")
         print("  extract <path>          headless full extraction (AST + semantic LLM) for CI/scripts")
-        print("    --backend B             gemini|kimi|claude|openai|deepseek|ollama (default: whichever API key is set)")
+        print("    --backend B             openrouter-deepseek|openrouter-kimi|deepseek|gemini|kimi|claude|openai|ollama (default: OpenRouter DeepSeek when OPENROUTER_API_KEY is set)")
         print("    --model M               override backend default model")
         print("    --max-workers N         AST extraction subprocess count (default: cpu_count)")
         print("    --token-budget N        per-chunk token cap for semantic extraction (default: 60000)")
@@ -1576,6 +1578,26 @@ def main() -> None:
             source_nodes=opts.nodes or None,
         )
         print(f"Saved to {out}")
+    elif cmd == "quality":
+        from graphify.quality import format_report, inspect_graph
+        graph_path = Path(_default_graph_path())
+        emit_json = False
+        for arg in sys.argv[2:]:
+            if arg == "--json":
+                emit_json = True
+            else:
+                graph_path = Path(arg)
+        try:
+            report = inspect_graph(graph_path)
+        except Exception as exc:
+            print(f"error: could not inspect graph quality: {exc}", file=sys.stderr)
+            sys.exit(1)
+        if emit_json:
+            print(json.dumps(report, indent=2))
+        else:
+            print(format_report(report))
+        if report["status"] != "pass":
+            sys.exit(1)
     elif cmd == "path":
         if len(sys.argv) < 4:
             print("Usage: graphify path \"<source>\" \"<target>\" [--graph path]", file=sys.stderr)
@@ -2402,7 +2424,7 @@ def _load_graph(p: str):
         # has an API key set.
         if len(sys.argv) < 3:
             print(
-                "Usage: graphify extract <path> [--backend gemini|kimi|claude|openai|deepseek|ollama] "
+                "Usage: graphify extract <path> [--backend openrouter-deepseek|openrouter-kimi|deepseek|gemini|kimi|claude|openai|ollama] "
                 "[--model M] [--out DIR] [--google-workspace] [--no-cluster] "
                 "[--max-workers N] [--token-budget N] [--max-concurrency N] "
                 "[--api-timeout S]",
@@ -2525,14 +2547,16 @@ def _parse_float(name: str, raw: str) -> float:
             extract_corpus_parallel as _extract_corpus_parallel,
             _format_backend_env_keys,
             _get_backend_api_key,
+            _sanitize_extraction_result,
         )
         if backend is None:
             backend = _detect_backend()
             if backend is None:
                 print(
-                    "error: no LLM API key found. Set GEMINI_API_KEY or GOOGLE_API_KEY "
-                    "(gemini), MOONSHOT_API_KEY (kimi), ANTHROPIC_API_KEY (claude), "
-                    "OPENAI_API_KEY (openai), DEEPSEEK_API_KEY (deepseek), "
+                    "error: no LLM API key found. Set OPENROUTER_API_KEY "
+                    "(default openrouter-deepseek), DEEPSEEK_API_KEY (deepseek), "
+                    "GEMINI_API_KEY or GOOGLE_API_KEY (gemini), MOONSHOT_API_KEY (kimi), "
+                    "ANTHROPIC_API_KEY (claude), OPENAI_API_KEY (openai), "
                     "or pass --backend.",
                     file=sys.stderr,
                 )
@@ -2677,11 +2701,16 @@ def _parse_float(name: str, raw: str) -> float:
             cached_nodes, cached_edges, cached_hyperedges, uncached_paths = (
                 _check_semantic_cache(sem_paths_str, root=target)
             )
+            cached_fragment = _sanitize_extraction_result({
+                "nodes": cached_nodes,
+                "edges": cached_edges,
+                "hyperedges": cached_hyperedges,
+            })
             sem_cache_hits = len(semantic_files) - len(uncached_paths)
             sem_cache_misses = len(uncached_paths)
-            sem_result["nodes"].extend(cached_nodes)
-            sem_result["edges"].extend(cached_edges)
-            sem_result["hyperedges"].extend(cached_hyperedges)
+            sem_result["nodes"].extend(cached_fragment["nodes"])
+            sem_result["edges"].extend(cached_fragment["edges"])
+            sem_result["hyperedges"].extend(cached_fragment["hyperedges"])
             if sem_cache_hits:
                 print(f"[graphify extract] semantic cache: {sem_cache_hits} hit / {sem_cache_misses} miss")
 
@@ -2722,6 +2751,7 @@ def _progress(idx: int, total: int, _result: dict) -> None:
                         file=sys.stderr,
                     )
                     fresh = {"nodes": [], "edges": [], "hyperedges": [], "input_tokens": 0, "output_tokens": 0}
+                fresh = _sanitize_extraction_result(fresh)
                 try:
                     _save_semantic_cache(
                         fresh.get("nodes", []),
@@ -2748,6 +2778,7 @@ def _progress(idx: int, total: int, _result: dict) -> None:
             "input_tokens": ast_result.get("input_tokens", 0) + sem_result.get("input_tokens", 0),
             "output_tokens": ast_result.get("output_tokens", 0) + sem_result.get("output_tokens", 0),
         }
+        merged = _sanitize_extraction_result(merged)
 
         graph_json_path = graphify_out / "graph.json"
         analysis_path = graphify_out / ".graphify_analysis.json"

diff --git a/graphify/build.py b/graphify/build.py
@@ -83,6 +83,18 @@ def _norm_source_file(p: str | None, root: str | None = None) -> str | None:
     return p
 
 
+def _dict_items(value: object) -> list[dict]:
+    """Filter a graph list down to its dict members; any non-list input gives []."""
+    if isinstance(value, list):
+        return [entry for entry in value if isinstance(entry, dict)]
+    return []
+
+
+def _label_from_id(node_id: str) -> str:
+    """Build a fallback display label from a node id, e.g. 'auth-token_store' -> 'Auth Token Store'."""
+    return " ".join(filter(None, str(node_id).replace("-", "_").split("_"))).title()
+
+
 def edge_data(G: nx.Graph, u: str, v: str) -> dict:
     """Return one edge attribute dict for (u, v), tolerating MultiGraph.
 
@@ -112,15 +124,17 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
     root: if given, absolute source_file paths from semantic subagents are made
         relative to root so all nodes share a consistent path key (#932).
     """
+    extraction = dict(extraction)
     _root = str(Path(root).resolve()) if root else None
     # NetworkX <= 3.1 serialised edges as "links"; remap to "edges" for compatibility.
     if "edges" not in extraction and "links" in extraction:
-        extraction = dict(extraction, edges=extraction["links"])
+        extraction["edges"] = extraction["links"]
+
+    for key in ("nodes", "edges", "hyperedges"):
+        extraction[key] = _dict_items(extraction.get(key))
 
     # Canonicalize legacy node/edge schema before validation.
     for node in extraction.get("nodes", []):
-        if not isinstance(node, dict):
-            continue
         if "source" in node and "source_file" not in node:
             # Count edges that reference this node so the warning is actionable (#479)
             node_id = node.get("id", "?")
@@ -135,6 +149,8 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
                 file=sys.stderr,
             )
             node["source_file"] = node.pop("source")
+        if not node.get("label"):
+            node["label"] = _label_from_id(node.get("id", ""))
         # Default missing/None file_type to "concept" so legacy graph.json
         # entries (and stub nodes preserved by `_rebuild_code` from older
         # graphify versions that didn't always populate file_type) don't
@@ -145,6 +161,20 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
         if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}:
             node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept")
 
+    for edge in extraction.get("edges", []):
+        if "confience_score" in edge:
+            typo_score = edge.pop("confience_score")
+            if "confidence_score" not in edge:
+                edge["confidence_score"] = typo_score
+        if "source" not in edge and "from" in edge:
+            edge["source"] = edge["from"]
+        if "target" not in edge and "to" in edge:
+            edge["target"] = edge["to"]
+        if not edge.get("relation"):
+            edge["relation"] = "conceptually_related_to"
+        if not edge.get("source_file"):
+            edge["source_file"] = "unknown"
+
     errors = validate_extraction(extraction)
     # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors.
     real_errors = [e for e in errors if "does not match any node id" not in e]
@@ -161,10 +191,6 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
     # e.g. "Session_ValidateToken" maps to "session_validatetoken".
     norm_to_id: dict[str, str] = {_normalize_id(nid): nid for nid in node_set}
     for edge in extraction.get("edges", []):
-        if "source" not in edge and "from" in edge:
-            edge["source"] = edge["from"]
-        if "target" not in edge and "to" in edge:
-            edge["target"] = edge["to"]
         if "source" not in edge or "target" not in edge:
             continue
         src, tgt = edge["source"], edge["target"]
@@ -214,9 +240,9 @@ def build(
     from graphify.dedup import deduplicate_entities
     combined: dict = {"nodes": [], "edges": [], "hyperedges": [], "input_tokens": 0, "output_tokens": 0}
     for ext in extractions:
-        combined["nodes"].extend(ext.get("nodes", []))
-        combined["edges"].extend(ext.get("edges", []))
-        combined["hyperedges"].extend(ext.get("hyperedges", []))
+        combined["nodes"].extend(_dict_items(ext.get("nodes", [])))
+        combined["edges"].extend(_dict_items(ext.get("edges", [])))
+        combined["hyperedges"].extend(_dict_items(ext.get("hyperedges", [])))
         combined["input_tokens"] += ext.get("input_tokens", 0)
         combined["output_tokens"] += ext.get("output_tokens", 0)
     if dedup and combined["nodes"]:

diff --git a/graphify/cache.py b/graphify/cache.py
@@ -14,6 +14,13 @@
 _GRAPHIFY_OUT = os.environ.get("GRAPHIFY_OUT", "graphify-out")
 
 
+def _dict_items(value: object) -> list[dict]:
+    """Filter a cached/extracted graph list down to its dict entries; non-list input gives []."""
+    if isinstance(value, list):
+        return [entry for entry in value if isinstance(entry, dict)]
+    return []
+
+
 def _body_content(content: bytes) -> bytes:
     """Strip YAML frontmatter from Markdown content, returning only the body."""
     text = content.decode(errors="replace")
@@ -280,9 +287,9 @@ def check_semantic_cache(
             p = Path(root) / p
         result = load_cached(p, root, kind="semantic")
         if result is not None:
-            cached_nodes.extend(result.get("nodes", []))
-            cached_edges.extend(result.get("edges", []))
-            cached_hyperedges.extend(result.get("hyperedges", []))
+            cached_nodes.extend(_dict_items(result.get("nodes", [])))
+            cached_edges.extend(_dict_items(result.get("edges", [])))
+            cached_hyperedges.extend(_dict_items(result.get("hyperedges", [])))
         else:
             uncached.append(fpath)
 
@@ -306,14 +313,20 @@ def save_semantic_cache(
 
     by_file: dict[str, dict] = defaultdict(lambda: {"nodes": [], "edges": [], "hyperedges": []})
     for n in nodes:
+        if not isinstance(n, dict):
+            continue
         src = n.get("source_file", "")
         if src:
             by_file[src]["nodes"].append(n)
     for e in edges:
+        if not isinstance(e, dict):
+            continue
         src = e.get("source_file", "")
         if src:
             by_file[src]["edges"].append(e)
     for h in (hyperedges or []):
+        if not isinstance(h, dict):
+            continue
         src = h.get("source_file", "")
         if src:
             by_file[src]["hyperedges"].append(h)

diff --git a/graphify/export.py b/graphify/export.py
@@ -472,6 +472,11 @@ def _git_head() -> str | None:
         return None
 
 
+def _label_from_id(node_id: str) -> str:
+    """Build a fallback display label from a node id, e.g. 'auth-token_store' -> 'Auth Token Store'."""
+    return " ".join(filter(None, str(node_id).replace("-", "_").split("_"))).title()
+
+
 def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *, force: bool = False, built_at_commit: str | None = None) -> bool:
     # Safety check: refuse to silently shrink an existing graph (#479)
     existing_path = Path(output_path)
@@ -499,9 +504,21 @@ def to_json(G: nx.Graph, communities: dict[int, list[str]], output_path: str, *,
     except TypeError:
         data = json_graph.node_link_data(G)
     for node in data["nodes"]:
+        if not node.get("label"):
+            node["label"] = _label_from_id(node.get("id", ""))
+        if not node.get("source_file"):
+            node["source_file"] = "unknown"
         node["community"] = node_community.get(node["id"])
         node["norm_label"] = _strip_diacritics(node.get("label", "")).lower()
     for link in data["links"]:
+        if "confience_score" in link:
+            typo_score = link.pop("confience_score")
+            if "confidence_score" not in link:
+                link["confidence_score"] = typo_score
+        if not link.get("relation"):
+            link["relation"] = "conceptually_related_to"
+        if not link.get("source_file"):
+            link["source_file"] = "unknown"
         if "confidence_score" not in link:
             conf = link.get("confidence", "EXTRACTED")
             link["confidence_score"] = _CONFIDENCE_SCORE_DEFAULTS.get(conf, 1.0)