Skip to content
Draft
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
f378055
feat(codegraph): content-addressed code retrieval engine + agent tool…
sanil-23 May 26, 2026
0b185b4
feat(skills): skill input + definition types for the registry (D2 par…
sanil-23 May 26, 2026
127cd61
feat(skills): registry loader + skills_run background RPC (D2/D3)
sanil-23 May 26, 2026
768d1b0
feat(skills): run skills as the orchestrator agent guided by SKILL.md
sanil-23 May 26, 2026
bf3add4
style(codegraph,skills): apply rustfmt to feat-branch files
sanil-23 May 27, 2026
24d1a55
perf(codegraph): batch embeds + single-transaction blob inserts
sanil-23 May 27, 2026
cb07fc1
fix(codegraph): never send empty structural docs to the embedder
sanil-23 May 27, 2026
fcebf55
test(codegraph): index_e2e_cloud tolerates Partial coverage
sanil-23 May 27, 2026
49460be
feat(codegraph): size-gated index modes + synchronous index-first
sanil-23 May 27, 2026
ee55449
feat(skills): skill_run runs the orchestrator + streams every step to…
sanil-23 May 27, 2026
697e930
feat(skills): ship github-issue-crusher as a bundled default skill
sanil-23 May 27, 2026
6e66acb
feat(skills): seed bundled default skills at core boot
sanil-23 May 27, 2026
0078c7b
feat(skills): make github-issue-crusher fork-aware (cross-repo PR)
sanil-23 May 27, 2026
54d3a90
feat(skills): autonomous skill runs — lifted iteration cap + full web
sanil-23 May 27, 2026
fd75e55
Merge branch 'feat/codegraph-skills' of https://github.com/sanil-23/o…
graycyrus May 27, 2026
aaf8b31
feat(skills): tighten github-issue-crusher SKILL.md — delegation disc…
sanil-23 May 27, 2026
75271bf
feat(skills): code_executor navigates codegraph-first
sanil-23 May 27, 2026
ebfbd05
feat(dev-workflow): wire config to cron + bundled skill + execution UI
graycyrus May 28, 2026
e8f6c2f
test(dev-workflow): update panel tests for cron RPC instead of localS…
graycyrus May 28, 2026
ec01a6d
test(dev-workflow): add coverage for toggle, run now, history, and er…
graycyrus May 28, 2026
e9a04b7
feat(skills): issue-crusher uses local git+gh, opens DRAFT PR, pins i…
sanil-23 May 28, 2026
296bb8e
fix(skills): isolate skill_run transcripts so resume can never poison
sanil-23 May 28, 2026
d47d5fc
skills(issue-crusher): name delegate_run_code explicitly per step
sanil-23 May 28, 2026
c918092
skills(issue-crusher): make codegraph_search mandatory in step 5
sanil-23 May 28, 2026
c068d26
orchestrator: route ALL code-repo work to delegate_run_code; strip SK…
sanil-23 May 28, 2026
f389a31
agents: tighten when_to_use for code_executor + tools_agent so the LL…
sanil-23 May 28, 2026
3e90d05
skills: reject degenerate-response final messages; bind code_executor…
sanil-23 May 28, 2026
2e36b17
skills: add pr-review-shepherd — Phase-6 PR-to-mergeable shepherd
sanil-23 May 28, 2026
815b499
skills: add run_skill orchestrator tool for skill chaining (issue-cru…
sanil-23 May 28, 2026
ec9a576
Merge graycyrus/feat/dev-workflow-full into feat/codegraph-skills
sanil-23 May 28, 2026
1f875ac
skills: add openhuman.skills_describe RPC for FE skill picker
sanil-23 May 28, 2026
14ac178
frontend: SkillsRunnerPanel — pick any bundled skill, render inputs, …
sanil-23 May 28, 2026
8594e7c
skills: add openhuman.skills_recent_runs RPC + scan_runs parser
sanil-23 May 28, 2026
54d3448
fix(skills): clean up scan_runs parser — split on first colon
sanil-23 May 28, 2026
dc4b473
frontend: SkillsRunnerPanel gains cron scheduling + recent runs viewer
sanil-23 May 28, 2026
9200e75
skills: in-app log viewer — chat-like inline expand of any run's stre…
sanil-23 May 28, 2026
e49c93d
frontend: promote Skills Runner to /skills as new 'Runners' tab
sanil-23 May 28, 2026
4363539
tauri(cef): honor OPENHUMAN_CEF_NO_SANDBOX=1 to launch on non-root Li…
sanil-23 May 28, 2026
91397c7
frontend: rich Composio pickers for repo/branch inputs in SkillsRunne…
sanil-23 May 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
829 changes: 829 additions & 0 deletions src/openhuman/codegraph/index.rs

Large diffs are not rendered by default.

29 changes: 29 additions & 0 deletions src/openhuman/codegraph/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
//! codegraph — content-addressed code retrieval for coding subagents.
//!
//! The seed engine behind the issue-crusher / pr-reviewer skills. Retrieval is
//! `BM25 (SQLite FTS5) ∪ structural-aug dense (embeddings domain)`, RRF-fused.
//! Indexing is content-addressed: every file's `{tokens, struct-doc embedding}`
//! is cached by its git **blob SHA** (+ embedding-model signature); a branch's
//! index is just its per-`(repo, ref)` **manifest** rows joined to the shared
//! blob cache at query time. Branch switch / new commit / pull only (re)embed
//! the blobs that actually changed.
//!
//! Pure Rust: `tree-sitter` for structure, `rusqlite`+FTS5 for lexical, and the
//! `embeddings` domain (cloud by default) for vectors. No Python, no extra
//! services.
//!
//! Layers:
//! - [`store`] — persistent SQLite blob cache + manifests.
//! - [`index`] — tree-sitter extract + FTS5 + dense, incremental.
//! - [`search`] — BM25 ∪ dense RRF + coverage flag.

pub mod index;
pub mod search;
pub mod store;

pub use index::{
code_tokens, count_code_files, current_ref, index_ref, structural_doc, IndexMode, IndexReport,
LEXICAL_MODEL,
};
pub use search::{search_ref, Coverage, SearchOutcome};
pub use store::{BlobEntry, CodegraphStore};
289 changes: 289 additions & 0 deletions src/openhuman/codegraph/search.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
//! Retrieval: the seed. Hydrate a `(repo, ref)` working set from the store,
//! score it with **BM25 (lexical) ∪ dense (cosine)**, **RRF-fuse**, and report
//! a **coverage** flag (`full`/`partial`/`none`) so callers know whether the
//! index is complete or the agent should lean on grep.
//!
//! BM25 is in-memory over the hydrated tokens (the working set is one repo's
//! files — small; this matches the validated prototype and keeps the
//! hydrate-per-query model simple). Dense is cosine over the L2-normalised
//! structural-aug vectors. The query is embedded once with the same provider
//! the index was built with (its `signature()` is the cache `model` key).

use std::collections::{HashMap, HashSet};

use anyhow::{Context, Result};

use crate::openhuman::embeddings::EmbeddingProvider;

use super::index::code_tokens;
use super::store::{BlobEntry, CodegraphStore};

/// Reciprocal-rank-fusion constant: in `rrf`, a doc at 0-based `rank` in a
/// ranking contributes `1 / (RRF_K + rank + 1)` to its fused score.
const RRF_K: f32 = 60.0;
/// Number of leading entries of each ranking that `rrf` takes into account.
const PER_ARM: usize = 20; // top-N from each arm fed into RRF
/// The `k1` parameter of the BM25 formula evaluated in `bm25_rank`.
const BM25_K1: f32 = 1.5;
/// The `b` parameter of the BM25 formula evaluated in `bm25_rank`; it weights
/// the ratio of a doc's token count to the average token count.
const BM25_B: f32 = 0.75;

/// How complete the index is for the queried `(repo, ref)`.
///
/// Serialized in `snake_case` (`full`, `partial`, `none`). `search_ref`
/// derives the value by comparing the number of hydrated docs with the number
/// of manifest rows.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum Coverage {
/// Every manifest file is embedded — trust the candidates.
/// Reported when the hydrated doc count is at least the manifest size.
Full,
/// Some files still pending (background index in flight) — treat as hints.
/// Reported when the manifest is non-empty but fewer docs were hydrated;
/// this includes the case where no doc could be hydrated at all.
Partial,
/// Nothing indexed yet — fall back to grep.
/// Reported when the manifest for the `(repo, ref)` has no rows.
None,
}

/// The seed result: ranked candidate paths + how complete the index was.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchOutcome {
/// Paths of the fused candidates, best first; at most the `k` requested
/// from `search_ref`, and empty when no doc could be hydrated.
pub hits: Vec<String>,
/// Completeness of the index that produced `hits`.
pub coverage: Coverage,
/// Files embedded (hydrated) vs total in the manifest.
pub indexed: usize,
/// Number of manifest rows for the queried `(repo, ref)`.
pub total: usize,
}

/// Scales `v` in place so that its Euclidean (L2) norm becomes 1.
///
/// The norm is the square root of the sum of squared components; every
/// component is divided by it.
fn l2_normalize(v: &mut [f32]) {
let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
// A vector whose norm is not positive (e.g. empty or all zeros) is left
// untouched, which also avoids dividing by zero.
if norm > 0.0 {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[minor] l2_normalize is identical to the one in index.rs (line 214). Extract it to a shared location in the module.

for x in v.iter_mut() {
*x /= norm;
}
}
}

/// Okapi BM25 over the hydrated docs.
///
/// Scores every doc in `docs` against the distinct terms of `query` and
/// returns the doc indices ordered best-first (via `rank_by_score`). Docs that
/// contain none of the query terms keep a score of zero but are still part of
/// the returned ranking.
fn bm25_rank(docs: &[BlobEntry], query: &[String]) -> Vec<usize> {
    let doc_count = docs.len() as f32;
    let doc_lens: Vec<f32> = docs.iter().map(|d| d.tokens.len() as f32).collect();
    // Floor the average length at 1 so the length ratio below stays finite
    // (this also absorbs the NaN produced by an empty corpus).
    let avg_len = (doc_lens.iter().sum::<f32>() / doc_count).max(1.0);

    // One term -> frequency table per document.
    let mut term_freqs: Vec<HashMap<&str, f32>> = Vec::with_capacity(docs.len());
    for d in docs {
        let mut table: HashMap<&str, f32> = HashMap::new();
        for tok in &d.tokens {
            *table.entry(tok.as_str()).or_default() += 1.0;
        }
        term_freqs.push(table);
    }

    // Repeated query terms count once.
    let unique_terms: HashSet<&str> = query.iter().map(String::as_str).collect();

    let mut scores = vec![0.0f32; docs.len()];
    for term in unique_terms {
        // (doc index, term frequency) for every doc containing the term.
        let postings: Vec<(usize, f32)> = term_freqs
            .iter()
            .enumerate()
            .filter_map(|(i, table)| table.get(term).map(|&tf| (i, tf)))
            .collect();
        if postings.is_empty() {
            continue;
        }
        let df = postings.len() as f32;
        let idf = (((doc_count - df + 0.5) / (df + 0.5)) + 1.0).ln();
        for (i, tf) in postings {
            let denom = tf + BM25_K1 * (1.0 - BM25_B + BM25_B * doc_lens[i] / avg_len);
            scores[i] += idf * (tf * (BM25_K1 + 1.0)) / denom;
        }
    }
    rank_by_score(&scores)
}

/// Dense arm: dot product of `qv` with each doc's embedding, ranked best-first.
///
/// The dot product equals cosine similarity when both vectors are
/// L2-normalised. Components are paired positionally; if the two vectors
/// differ in length, only the common prefix contributes.
fn dense_rank(docs: &[BlobEntry], qv: &[f32]) -> Vec<usize> {
    let mut similarities: Vec<f32> = Vec::with_capacity(docs.len());
    for entry in docs {
        let dot = entry
            .emb
            .iter()
            .zip(qv.iter())
            .map(|(x, y)| x * y)
            .sum::<f32>();
        similarities.push(dot);
    }
    rank_by_score(&similarities)
}

/// Returns the indices of `scores` ordered from the highest score to the lowest.
///
/// Equal scores keep their original index order (the sort is stable). `NaN`
/// scores are ranked after every real score rather than being treated as
/// "equal to anything": that keeps the comparator a consistent total order,
/// which the standard-library sort requires (with an inconsistent comparator
/// the resulting order is unspecified and the sort may panic).
fn rank_by_score(scores: &[f32]) -> Vec<usize> {
    use std::cmp::Ordering;

    let mut idx: Vec<usize> = (0..scores.len()).collect();
    idx.sort_by(|&a, &b| {
        let (sa, sb) = (scores[a], scores[b]);
        match (sa.is_nan(), sb.is_nan()) {
            (true, true) => Ordering::Equal,
            // NaN sorts after any real score.
            (true, false) => Ordering::Greater,
            (false, true) => Ordering::Less,
            // Descending order; `partial_cmp` cannot fail once NaN is excluded.
            (false, false) => sb.partial_cmp(&sa).unwrap_or(Ordering::Equal),
        }
    });
    idx
}

/// Reciprocal-rank fusion of several rankings (top-`PER_ARM` of each), top-`k`.
///
/// Each ranking lists doc indices best-first. A doc at 0-based `rank` in a
/// ranking adds `1 / (RRF_K + rank + 1)` to its fused score; docs are returned
/// by descending fused score, truncated to `k` entries.
///
/// Docs with equal fused scores are ordered by ascending doc index. Without
/// that tie-break the order of ties would follow `HashMap` iteration order,
/// which differs from run to run.
fn rrf(rankings: &[Vec<usize>], k: usize) -> Vec<usize> {
    let mut score: HashMap<usize, f32> = HashMap::new();
    for ranking in rankings {
        for (rank, &doc) in ranking.iter().take(PER_ARM).enumerate() {
            *score.entry(doc).or_insert(0.0) += 1.0 / (RRF_K + rank as f32 + 1.0);
        }
    }
    let mut items: Vec<(usize, f32)> = score.into_iter().collect();
    // Keys are unique, so (score desc, index asc) is a strict total order and
    // an unstable sort yields a fully deterministic result.
    items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    items.into_iter().take(k).map(|(i, _)| i).collect()
}

/// Seed `query` against a `(repo, ref)` index: BM25 ∪ dense, RRF-fused, top-`k`,
/// with a coverage flag.
///
/// The index mode is auto-detected: rows stored under the embedder's
/// `signature()` enable the dense arm; if there are none, the lexical-only
/// rows (`LEXICAL_MODEL`) are used and the embedder is never called. When the
/// dense arm is active the query is embedded once with `embedder`.
///
/// If the embedder returns no vector (or an empty one) for the query, the
/// dense arm is skipped and the BM25 ranking is used alone: an empty query
/// vector would score every doc identically and feed an arbitrary
/// index-order ranking into the fusion.
///
/// # Errors
///
/// Propagates failures from the store (`manifest_size`, `hydrate`) and from
/// the query embedding call.
pub async fn search_ref(
    store: &mut CodegraphStore,
    repo_id: &str,
    git_ref: &str,
    query: &str,
    embedder: &dyn EmbeddingProvider,
    k: usize,
) -> Result<SearchOutcome> {
    let total = store.manifest_size(repo_id, git_ref)?;
    // Auto-detect the index mode: prefer the dense arm (rows under the
    // embedder's signature); if none, fall back to the lexical-only key (a
    // small repo indexed BM25-only). Lexical search makes no embedder call.
    let dense_model = embedder.signature();
    let mut docs = store.hydrate(repo_id, git_ref, &dense_model)?;
    let dense_active = !docs.is_empty();
    if !dense_active {
        docs = store.hydrate(repo_id, git_ref, super::index::LEXICAL_MODEL)?;
    }

    let coverage = if total == 0 {
        Coverage::None
    } else if docs.len() >= total {
        Coverage::Full
    } else {
        Coverage::Partial
    };
    if docs.is_empty() {
        return Ok(SearchOutcome {
            hits: vec![],
            coverage,
            indexed: 0,
            total,
        });
    }

    let q_tokens = code_tokens(query);
    // The lexical arm is always present. RRF over a single ranking preserves
    // its order, so BM25 alone is a valid result.
    let mut arms = vec![bm25_rank(&docs, &q_tokens)];

    // Dense arm only when the index has vectors — otherwise no query-embed
    // round-trip.
    if dense_active {
        let query_vec = embedder
            .embed(&[query])
            .await
            .context("codegraph: embed query")?
            .into_iter()
            .next()
            // A missing or empty vector carries no signal; skip the arm.
            .filter(|v| !v.is_empty());
        if let Some(mut qv) = query_vec {
            l2_normalize(&mut qv);
            arms.push(dense_rank(&docs, &qv));
        }
    }

    let fused = rrf(&arms, k);
    let hits = fused.into_iter().map(|i| docs[i].path.clone()).collect();
    Ok(SearchOutcome {
        hits,
        coverage,
        indexed: docs.len(),
        total,
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use async_trait::async_trait;
    use tempfile::TempDir;

    /// Fixture: a `BlobEntry` with the given path and tokens and an all-zero
    /// three-dimensional embedding.
    fn doc(path: &str, toks: &[&str]) -> BlobEntry {
        let tokens = toks.iter().map(|t| (*t).to_string()).collect();
        BlobEntry {
            path: path.into(),
            tokens,
            emb: vec![0.0; 3],
        }
    }

    #[test]
    fn bm25_ranks_the_matching_doc_first() {
        let corpus = [
            doc("auth.rs", &["login", "session", "token"]),
            doc("retry.rs", &["reconcile", "backoff", "charge"]),
            doc("util.rs", &["helper", "misc"]),
        ];
        let query = code_tokens("reconcile after backoff");
        let order = bm25_rank(&corpus, &query);
        assert_eq!(order[0], 1, "retry.rs ranks first for 'reconcile/backoff'");
    }

    #[test]
    fn rrf_blends_two_rankings() {
        // The first arm prefers doc 2, the second prefers doc 0; both must be
        // present in the fused list alongside doc 1.
        let arms = [vec![2, 1, 0], vec![0, 1, 2]];
        let fused = rrf(&arms, 3);
        assert!(fused.contains(&0));
        assert!(fused.contains(&2));
        assert_eq!(fused.len(), 3);
    }

    /// Embedder stub: every input text maps to the unit vector `[1, 0, 0]`.
    struct FakeEmbedder;

    #[async_trait]
    impl EmbeddingProvider for FakeEmbedder {
        fn name(&self) -> &str {
            "fake"
        }

        fn model_id(&self) -> &str {
            "fake-1"
        }

        fn dimensions(&self) -> usize {
            3
        }

        async fn embed(&self, texts: &[&str]) -> anyhow::Result<Vec<Vec<f32>>> {
            Ok(vec![vec![1.0, 0.0, 0.0]; texts.len()])
        }
    }

    #[tokio::test]
    async fn search_ref_returns_ranked_hits_and_partial_coverage() {
        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("cg.db");
        let mut store = CodegraphStore::open(&db_path).unwrap();
        let model = FakeEmbedder.signature();

        // Two cached blobs stored under the fake embedder's signature.
        store
            .put_blob(
                "a",
                &model,
                &["reconcile".into(), "backoff".into()],
                &[1.0, 0.0, 0.0],
            )
            .unwrap();
        store
            .put_blob(
                "b",
                &model,
                &["login".into(), "token".into()],
                &[0.0, 1.0, 0.0],
            )
            .unwrap();

        // The manifest lists a third file whose blob was never cached, so the
        // search must report partial coverage.
        store
            .set_manifest(
                "r",
                "main",
                &[
                    ("retry.rs".into(), "a".into()),
                    ("auth.rs".into(), "b".into()),
                    ("pending.rs".into(), "uncached".into()),
                ],
            )
            .unwrap();

        let outcome = search_ref(
            &mut store,
            "r",
            "main",
            "reconcile backoff",
            &FakeEmbedder,
            10,
        )
        .await
        .unwrap();

        assert_eq!(outcome.coverage, Coverage::Partial);
        assert_eq!(outcome.indexed, 2);
        assert_eq!(outcome.total, 3);
        assert_eq!(outcome.hits[0], "retry.rs", "lexical match surfaces first");
    }
}
Loading
Loading