feat(answer): add commit to answer engine prompt
Signed-off-by: Wei Zhang <[email protected]>
zwpaper committed Feb 25, 2025
1 parent 4fce8e5 commit edcb47f
Showing 7 changed files with 331 additions and 97 deletions.
140 changes: 100 additions & 40 deletions crates/tabby-common/src/api/commit.rs
@@ -1,6 +1,11 @@
use crate::index::{commit::fields, IndexSchema};

use async_trait::async_trait;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use chrono::{DateTime, TimeZone, Utc};
use tantivy::{
schema::{self, document::CompactDocValue, Value},
DateTime as TantivyDateTime, TantivyDocument,
};

use super::Result;

@@ -13,51 +18,17 @@ pub trait CommitHistorySearch: Send + Sync {
&self,
source_id: &str,
q: &str,
params: &CommitHistorySearchParams,
limit: usize,
) -> Result<CommitHistorySearchResponse>;
}

#[derive(Default, Clone, PartialEq, Debug)]
pub struct CommitHistorySearchScores {
/// Reciprocal rank fusion score: https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html
pub rrf: f32,
pub bm25: f32,
pub embedding: f32,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct CommitHistorySearchParams {
pub min_embedding_score: f32,
pub min_bm25_score: f32,
pub min_rrf_score: f32,

/// At most num_to_return results will be returned.
pub num_to_return: usize,

/// At most num_to_score results will be scored.
pub num_to_score: usize,
}

impl Default for CommitHistorySearchParams {
fn default() -> Self {
Self {
min_embedding_score: 0.75,
min_bm25_score: 8.0,
min_rrf_score: 0.028,

num_to_return: 20,
num_to_score: 40,
}
}
}

pub struct CommitHistorySearchResponse {
pub hits: Vec<CommitHistorySearchHit>,
}

#[derive(Clone, Debug)]
pub struct CommitHistorySearchHit {
pub scores: CommitHistorySearchScores,
pub score: f32,
pub commit: CommitHistoryDocument,
}

@@ -66,12 +37,101 @@ pub struct CommitHistoryDocument {
pub git_url: String,
pub sha: String,
pub message: String,
//TODO(kweizh): should we add branches for commit?
// pub branches: Vec<String>,
pub author_email: String,
pub author_at: DateTime<Utc>,
pub committer: String,
pub commit_at: DateTime<Utc>,

pub diff: Option<String>,
pub changed_file: Option<String>,
}

impl CommitHistoryDocument {
pub fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self> {
let schema = IndexSchema::instance();
let git_url =
get_json_text_field(doc, schema.field_attributes, fields::GIT_URL).to_string();
let sha = get_json_text_field(doc, schema.field_attributes, fields::SHA).to_string();
let message =
get_json_text_field(doc, schema.field_attributes, fields::MESSAGE).to_string();
let author_email =
get_json_text_field(doc, schema.field_attributes, fields::AUTHOR_EMAIL).to_string();
let author_at = get_json_date_field(doc, schema.field_attributes, fields::AUTHOR_AT)
.unwrap()
.into_timestamp_secs();
let committer =
get_json_text_field(doc, schema.field_attributes, fields::COMMITTER).to_string();
let commit_at = get_json_date_field(doc, schema.field_attributes, fields::COMMIT_AT)
.unwrap()
.into_timestamp_secs();
let diff =
get_json_option_text_field(chunk, schema.field_chunk_attributes, fields::CHUNK_DIFF)
.map(|s| s.to_string());
let changed_file = get_json_option_text_field(
chunk,
schema.field_chunk_attributes,
fields::CHUNK_FILEPATH,
)
.map(|s| s.to_string());

Some(Self {
git_url,
sha,
message,
author_email,
author_at: Utc.timestamp_opt(author_at, 0).single().unwrap_or_default(),
committer,
commit_at: Utc.timestamp_opt(commit_at, 0).single().unwrap_or_default(),

diff,
changed_file,
})
}
}

fn get_json_field<'a>(
doc: &'a TantivyDocument,
field: schema::Field,
name: &str,
) -> CompactDocValue<'a> {
doc.get_first(field)
.unwrap()
.as_object()
.unwrap()
.find(|(k, _)| *k == name)
.unwrap()
.1
}

fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
get_json_field(doc, field, name).as_str().unwrap()
}

fn get_json_option_field<'a>(
doc: &'a TantivyDocument,
field: schema::Field,
name: &str,
) -> Option<CompactDocValue<'a>> {
Some(
doc.get_first(field)?
.as_object()?
.find(|(k, _)| *k == name)?
.1,
)
}

fn get_json_date_field<'a>(
doc: &'a TantivyDocument,
field: schema::Field,
name: &str,
) -> Option<TantivyDateTime> {
get_json_option_field(doc, field, name).and_then(|field| field.as_datetime())
}

fn get_json_option_text_field<'a>(
doc: &'a TantivyDocument,
field: schema::Field,
name: &str,
) -> Option<&'a str> {
get_json_option_field(doc, field, name).and_then(|field| field.as_str())
}
136 changes: 129 additions & 7 deletions crates/tabby/src/services/commit/tantivy.rs
@@ -1,13 +1,29 @@
use std::{collections::HashSet, sync::Arc};

use async_trait::async_trait;
use std::sync::Arc;
use tabby_common::api::{
commit::{CommitHistorySearch, CommitHistorySearchParams, CommitHistorySearchResponse},
Result, SearchError,
use tabby_common::{
api::{
commit::{
CommitHistoryDocument, CommitHistorySearch, CommitHistorySearchHit,
CommitHistorySearchResponse,
},
Result, SearchError,
},
index::{self, corpus},
};
use tabby_inference::Embedding;
use tantivy::{
collector::TopDocs,
query::{BooleanQuery, ConstScoreQuery, Occur, Query},
schema::{self, Value},
IndexReader, TantivyDocument,
};
use tracing::warn;

use crate::services::tantivy::IndexReaderProvider;

const EMBEDDING_SCORE_THRESHOLD: f32 = 0.75;

pub struct CommitHistorySearchImpl {
embedding: Arc<dyn Embedding>,
provider: Arc<IndexReaderProvider>,
@@ -29,9 +45,115 @@ impl CommitHistorySearch for CommitHistorySearchImpl {
&self,
source_id: &str,
content: &str,
params: &CommitHistorySearchParams,
limit: usize,
) -> Result<CommitHistorySearchResponse> {
//TODO(kweizh)
Err(SearchError::NotReady)
if let Some(reader) = self.provider.reader().await.as_ref() {
self.index_search(reader, source_id, content, limit).await
} else {
Err(SearchError::NotReady)
}
}
}

struct ScoredChunk {
doc_id: String,
score: f32,
chunk: TantivyDocument,
}

impl CommitHistorySearchImpl {
async fn index_search(
&self,
reader: &IndexReader,
source_id: &str,
content: &str,
limit: usize,
) -> Result<CommitHistorySearchResponse> {
let schema = index::IndexSchema::instance();
let query = {
let embedding = self.embedding.embed(content).await?;
let embedding_tokens_query =
index::embedding_tokens_query(embedding.len(), embedding.iter());
let corpus_query = schema.corpus_query(corpus::COMMIT_HISTORY);

let mut query_clauses: Vec<(Occur, Box<dyn Query>)> = vec![
(
Occur::Must,
Box::new(ConstScoreQuery::new(corpus_query, 0.0)),
),
(Occur::Must, Box::new(embedding_tokens_query)),
];

if !source_id.is_empty() {
let source_id_query = Box::new(schema.source_id_query(source_id));
let source_id_query = ConstScoreQuery::new(source_id_query, 0.0);
query_clauses.push((Occur::Must, Box::new(source_id_query)));
}
BooleanQuery::new(query_clauses)
};

let searcher = reader.searcher();
let top_chunks = searcher.search(&query, &TopDocs::with_limit(limit * 2))?;

let chunks = {
// Extract all chunks.
let mut chunks: Vec<_> = top_chunks
.iter()
.filter_map(|(score, chunk_address)| {
let chunk: TantivyDocument = searcher.doc(*chunk_address).ok()?;
let doc_id = get_text(&chunk, schema.field_id).to_owned();
Some(ScoredChunk {
score: *score,
chunk,
doc_id,
})
})
.collect();

// Sort by score in descending order.
chunks.sort_unstable_by(|lhs, rhs| rhs.score.total_cmp(&lhs.score));

// Deduplicate by doc_id.
let mut doc_ids = HashSet::new();
chunks.retain(|x| doc_ids.insert(x.doc_id.clone()));

chunks
};

let hits = chunks
.iter()
.filter_map(
|ScoredChunk {
doc_id,
score,
chunk,
}| {
let doc_query = schema.doc_query(corpus::COMMIT_HISTORY, doc_id);
let top_docs = match searcher.search(&doc_query, &TopDocs::with_limit(1)) {
Err(err) => {
warn!("Failed to search doc `{}`: `{}`", doc_id, err);
return None;
}
Ok(top_docs) => top_docs,
};
let (_, doc_address) = top_docs.first()?;
let doc: TantivyDocument = searcher.doc(*doc_address).ok()?;
CommitHistoryDocument::from_tantivy_document(&doc, chunk).map(|commit| {
CommitHistorySearchHit {
score: *score,
commit,
}
})
},
)
.filter(|x| x.score >= EMBEDDING_SCORE_THRESHOLD)
.take(limit)
.collect();

Ok(CommitHistorySearchResponse { hits })
}
}

fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
doc.get_first(field).unwrap().as_str().unwrap()
}
3 changes: 3 additions & 0 deletions ee/tabby-db/src/attachment.rs
@@ -19,6 +19,9 @@ pub struct AttachmentCommit {
pub author_at: DateTime<Utc>,
pub committer_email: String,
pub commit_at: DateTime<Utc>,

pub diff: Option<String>,
pub changed_file: Option<String>,
}

#[derive(Serialize, Deserialize)]
32 changes: 17 additions & 15 deletions ee/tabby-schema/src/dao.rs
@@ -2,10 +2,10 @@ use anyhow::bail;
use hash_ids::HashIds;
use lazy_static::lazy_static;
use tabby_db::{
AttachmentClientCode, AttachmentCode, AttachmentCodeFileList, AttachmentDoc,
AttachmentClientCode, AttachmentCode, AttachmentCodeFileList, AttachmentCommit, AttachmentDoc,
AttachmentIssueDoc, AttachmentPullDoc, AttachmentWebDoc, EmailSettingDAO, IntegrationDAO,
InvitationDAO, JobRunDAO, LdapCredentialDAO, NotificationDAO, OAuthCredentialDAO, PageDAO,
PageSectionDAO, ServerSettingDAO, ThreadDAO, UserEventDAO, AttachmentCommit,
PageSectionDAO, ServerSettingDAO, ThreadDAO, UserEventDAO,
};

use crate::{
@@ -220,19 +220,21 @@ impl From<NotificationDAO> for Notification {
}
}

//TODO(kweizh)
impl From<AttachmentCommit> for thread::MessageAttachmentCommit {
fn from(value: AttachmentCommit) -> Self {
Self {
git_url: value.git_url,
sha: value.sha,
message: value.message,
author: None,
author_at: value.author_at,
committer: None,
commit_at: value.commit_at,
diff: None,
}
pub fn from_attachment_commit_history(
value: AttachmentCommit,
author: Option<UserValue>,
committer: Option<UserValue>,
) -> thread::MessageAttachmentCommit {
thread::MessageAttachmentCommit {
git_url: value.git_url,
sha: value.sha,
message: value.message,
author,
author_at: value.author_at,
committer,
commit_at: value.commit_at,
diff: value.diff,
changed_file: value.changed_file,
}
}

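The `dao.rs` change replaces the blanket `From<AttachmentCommit>` impl with an explicit constructor so the caller supplies already-resolved author and committer values instead of hard-coded `None`. A simplified, self-contained illustration of that shape, using stand-in types rather than the real `tabby-db` / `tabby-schema` definitions:

```rust
// Stand-in types for illustration only; the real fields live in
// tabby-db's AttachmentCommit and tabby-schema's MessageAttachmentCommit.
#[derive(Debug)]
struct DbCommit {
    sha: String,
    message: String,
    diff: Option<String>,
}

#[derive(Debug)]
struct ApiUser {
    name: String,
}

#[derive(Debug)]
struct ApiCommit {
    sha: String,
    message: String,
    author: Option<ApiUser>,
    diff: Option<String>,
}

// The conversion takes a resolved user value as an argument rather than defaulting it.
fn to_api_commit(value: DbCommit, author: Option<ApiUser>) -> ApiCommit {
    ApiCommit {
        sha: value.sha,
        message: value.message,
        author,
        diff: value.diff,
    }
}

fn main() {
    let record = DbCommit {
        sha: "edcb47f".into(),
        message: "feat(answer): add commit to answer engine prompt".into(),
        diff: None,
    };
    let commit = to_api_commit(record, Some(ApiUser { name: "Wei Zhang".into() }));
    println!("{commit:?}");
}
```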
