code finally compiles
Signed-off-by: Zhenbo Li <[email protected]>
Endle committed Aug 19, 2024
1 parent 80c3438 commit 8116608
Showing 7 changed files with 133 additions and 55 deletions.
1 change: 1 addition & 0 deletions fire_seq_search_server/Cargo.toml
@@ -26,6 +26,7 @@ url = "2.3.1"

# QueryEngine
tantivy = "0.22"
tantivy-tokenizer-api = "0.3.0"
jieba-rs = { version = "0.7.0" }


8 changes: 5 additions & 3 deletions fire_seq_search_server/debug_server.sh
@@ -1,8 +1,10 @@
set -e
rm -f ./fire_seq_search_server
rm -f ./fire_seq_search_server
# nix-shell -p cargo -p rustc -p libiconv --run "cargo build"
cargo build
cp target/debug/fire_seq_search_server ./fire_seq_search_server

RUST_BACKTRACE=1 RUST_LOG=warn,fire_seq_search_server=info ./fire_seq_search_server \
--notebook_path ~/logseq --enable-journal-query
export RUST_LOG="warn,fire_seq_search_server=info"
export RUST_LOG="debug"
export RUST_BACKTRACE=1
./fire_seq_search_server --notebook_path ~/logseq --enable-journal-query
81 changes: 79 additions & 2 deletions fire_seq_search_server/src/language_tools/tokenizer.rs
@@ -21,12 +21,16 @@ pub fn filter_out_stopwords<'a,'b>(term_tokens: &'a [String], nltk: &'b HashSet<


pub fn tokenize(sentence: &str) -> Vec<String> {
/*
lazy_static! {
static ref TK: crate::JiebaTokenizer = crate::JiebaTokenizer {};
}
*/
if crate::language_tools::is_chinese(sentence) {
info!("Use Tokenizer for Chinese term {}", sentence);
crate::tokenize_sentence_to_text_vec(&TK, sentence)
let mut jieba = FireSeqTokenizer {};
//TODO don't create a tokenizer every time
crate::tokenize_sentence_to_text_vec(&mut jieba, sentence)
} else {
// info!("Space Tokenizer {}", sentence);
let result : Vec<&str> = sentence.split_whitespace()
@@ -36,4 +40,77 @@ pub fn tokenize(sentence: &str) -> Vec<String> {
result
// vec![String::from(sentence)]
}
}
}

use lazy_static::lazy_static;
use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};

lazy_static! {
static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

pub const TOKENIZER_ID: &str = "fireseq_tokenizer";

#[derive(Clone)]
pub struct FireSeqTokenizer;



pub struct JiebaTokenStream {
tokens: Vec<Token>,
index: usize,
}

impl TokenStream for JiebaTokenStream {
fn advance(&mut self) -> bool {
if self.index < self.tokens.len() {
self.index = self.index + 1;
true
} else {
false
}
}
fn token(&self) -> &Token {
&self.tokens[self.index - 1]
}
fn token_mut(&mut self) -> &mut Token {
&mut self.tokens[self.index - 1]
}
}

impl Tokenizer for FireSeqTokenizer {
type TokenStream<'a> = JiebaTokenStream;
fn token_stream<'a>(&mut self, text: &'a str) -> JiebaTokenStream {
let mut indices = text.char_indices().collect::<Vec<_>>();
indices.push((text.len(), '\0'));
let orig_tokens = JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, true);
let mut tokens = Vec::new();
// copy tantivy-jieba code for now
for token in orig_tokens {
tokens.push(Token {
offset_from: indices[token.start].0,
offset_to: indices[token.end].0,
position: token.start,
text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]),
position_length: token.end - token.start,
});
}
/*
for i in 0..orig_tokens.len() {
let token = &orig_tokens[i];
match process_token_text(text, &indices, &token) {
Some(text) => tokens.push(Token {
offset_from: indices[token.start].0,
offset_to: indices[token.end].0,
position: token.start,
text,
position_length: token.end - token.start,
}),
None => ()
}
}
*/
JiebaTokenStream { tokens, index: 0 }
}
}
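
The new tokenizer can be exercised on its own through the tantivy-tokenizer-api traits. The following is a minimal sketch, not part of the commit, assuming the crate is consumed as fire_seq_search_server and the language_tools::tokenizer module stays public:

use fire_seq_search_server::language_tools::tokenizer::FireSeqTokenizer;
use tantivy_tokenizer_api::{TokenStream, Tokenizer};

fn main() {
    // Build the (stateless) tokenizer and walk the token stream by hand,
    // printing each token together with its byte offsets.
    let mut tokenizer = FireSeqTokenizer {};
    let mut stream = tokenizer.token_stream("滚滚长江东逝水");
    while stream.advance() {
        let token = stream.token();
        println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
}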
67 changes: 31 additions & 36 deletions fire_seq_search_server/src/lib.rs
@@ -26,46 +26,23 @@ pub struct Article {
// Based on https://github.com/jiegec/tantivy-jieba
// tantivy-jieba is licensed under MIT, Copyright 2019-2020 Jiajie Chen
// I have made heavy modifications to it
/*
lazy_static! {
static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}
*/

pub const TOKENIZER_ID: &str = "fss_tokenizer";

use tantivy::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};

pub struct JiebaTokenStream {
tokens: Vec<Token>,
index: usize,
}

//pub const TOKENIZER_ID: &str = "fss_tokenizer";

#[derive(Clone)]
pub struct JiebaTokenizer;

impl TokenStream for JiebaTokenStream {
fn advance(&mut self) -> bool {
if self.index < self.tokens.len() {
self.index = self.index + 1;
true
} else {
false
}
}
fn token(&self) -> &Token {
&self.tokens[self.index - 1]
}

fn token_mut(&mut self) -> &mut Token {
&mut self.tokens[self.index - 1]
}
}

/*
impl Tokenizer for JiebaTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
type TokenStream<'a> = JiebaTokenStream;
fn token_stream<'a>(&mut self, text: &'a str) -> JiebaTokenStream {
let mut indices = text.char_indices().collect::<Vec<_>>();
indices.push((text.len(), '\0'));
let orig_tokens = JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, true);
let jieba : jieba_rs::Jieba = jieba_rs::Jieba::new(); //TODO use a static one
let orig_tokens = jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true);
let mut tokens = Vec::new();
for i in 0..orig_tokens.len() {
let token = &orig_tokens[i];
@@ -81,9 +58,11 @@ impl Tokenizer for JiebaTokenizer {
}
}
BoxTokenStream::from(JiebaTokenStream { tokens, index: 0 })
JiebaTokenStream { tokens, index: 0 }
}
}
*/

/*
Thoughts on lowercase 2022-07-04:
@@ -104,14 +83,25 @@ fn process_token_text(text: &str, indices: &Vec<(usize, char)>, token: &jieba_rs
}
}

// TODO use stub now
pub fn tokenize_default(sentence: &str) -> Vec<String> {
let mut r = Vec::new();
r.push(sentence.to_owned());
r
}
/*
// TODO: Move tokenizer-related things into language_tools
pub fn tokenize_default(sentence: &str) -> Vec<String> {
/*
lazy_static! {
static ref TK: JiebaTokenizer = crate::JiebaTokenizer {};
}
*/
// TODO use static tokenizer
let mut tokenizer = crate::JiebaTokenizer{};
if language_tools::is_chinese(sentence) {
info!("Use Tokenizer for Chinese term {}", sentence);
tokenize_sentence_to_text_vec(&TK, sentence)
tokenize_sentence_to_text_vec(&mut tokenizer, sentence)
} else {
// info!("Space Tokenizer {}", sentence);
let result : Vec<&str> = sentence.split_whitespace()
@@ -122,13 +112,15 @@ pub fn tokenize_default(sentence: &str) -> Vec<String> {
// vec![String::from(sentence)]
}
}
*/


pub fn tokenize_sentence_to_text_vec(tokenizer: &JiebaTokenizer, sentence: &str) -> Vec<String> {
let tokens = tokenize_sentence_to_vector(&tokenizer, sentence);
use crate::language_tools::tokenizer::FireSeqTokenizer;
pub fn tokenize_sentence_to_text_vec(tokenizer: &mut FireSeqTokenizer, sentence: &str) -> Vec<String> {
let tokens = tokenize_sentence_to_vector(tokenizer, sentence);
tokens_to_text_vec(&tokens)
}
pub fn tokenize_sentence_to_vector(tokenizer: &JiebaTokenizer, sentence: &str) -> Vec<tantivy::tokenizer::Token> {
pub fn tokenize_sentence_to_vector(tokenizer: &mut FireSeqTokenizer, sentence: &str) -> Vec<tantivy::tokenizer::Token> {
use tantivy::tokenizer::*;
let mut token_stream = tokenizer.token_stream(
sentence
@@ -183,6 +175,7 @@ pub fn generate_server_info_for_test() -> ServerInformation {
server_info
}

/*
#[cfg(test)]
mod test_tokenizer {
#[test]
@@ -250,3 +243,5 @@ mod test_tokenizer {
}
*/
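
With JiebaTokenizer removed, the public helpers now take a &mut FireSeqTokenizer. A usage sketch (an assumption about external use, not code from this commit), still constructing the tokenizer per call as the TODOs above note:

use fire_seq_search_server::language_tools::tokenizer::FireSeqTokenizer;
use fire_seq_search_server::tokenize_sentence_to_text_vec;

fn demo() -> Vec<String> {
    // A fresh tokenizer per call; the commit leaves a TODO to make this static.
    let mut tokenizer = FireSeqTokenizer {};
    tokenize_sentence_to_text_vec(&mut tokenizer, "滚滚长江东逝水")
}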

11 changes: 6 additions & 5 deletions fire_seq_search_server/src/local_llm/mod.rs
Expand Up @@ -35,7 +35,7 @@ impl LlmEngine {

use std::process::{Command, Stdio};
use std::fs::File;
let cmd = Command::new("sh")
let _cmd = Command::new("sh")
.args([ &lfile, "--nobrowser",
"--port", LLM_SERVER_PORT,
//">/tmp/llamafile.stdout", "2>/tmp/llamafile.stderr",
@@ -57,8 +57,8 @@ impl LlmEngine {
loop {
let resp = reqwest::get(endpoint.to_owned() + "/health").await;
let resp = match resp {
Err(e) => {
info!("llm not ready ");
Err(_e) => {
info!("llm not ready");
let wait_llm = time::Duration::from_millis(100);
tokio::time::sleep(wait_llm).await;
task::yield_now().await;
@@ -135,7 +135,8 @@ struct LlamaFileDef {


async fn locate_llamafile() -> Option<String> {
use sha256::try_digest;
// TODO
//use sha256::try_digest;
let mut lf = LlamaFileDef {
filename: "mistral-7b-instruct-v0.2.Q4_0.llamafile".to_owned(),
filepath: None,
@@ -148,7 +149,7 @@
lf.filepath = Some( lf_path.to_owned() );
info!("lf {:?}", &lf);

let ppath = std::path::Path::new(lf_path);
//let ppath = std::path::Path::new(lf_path);
//let val = try_digest(ppath).unwrap();
let val = "1903778f7defd921347b25327ebe5dd902f29417ba524144a8e4f7c32d83dee8";
if val != lf.sha256 {
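
The loop above polls the llamafile /health endpoint until it answers. A standalone sketch of that readiness-poll pattern (assuming reqwest, tokio, and log as used in this module; the 100 ms interval mirrors the code above):

use std::time::Duration;

async fn wait_until_ready(endpoint: &str) {
    loop {
        match reqwest::get(format!("{}/health", endpoint)).await {
            // Any response at all means the server is up and can take requests.
            Ok(_) => break,
            Err(_) => {
                log::info!("llm not ready");
                tokio::time::sleep(Duration::from_millis(100)).await;
                tokio::task::yield_now().await;
            }
        }
    }
}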
2 changes: 0 additions & 2 deletions fire_seq_search_server/src/main.rs
@@ -1,5 +1,3 @@
use std::net::SocketAddr;

use log::info;
use fire_seq_search_server::query_engine::{QueryEngine, ServerInformation};
use fire_seq_search_server::local_llm::LlmEngine;
18 changes: 11 additions & 7 deletions fire_seq_search_server/src/query_engine/mod.rs
@@ -1,7 +1,7 @@
// Everything about Tantivy should be hidden behind this component

use log::{info, warn};
use crate::{Article, decode_cjk_str, JiebaTokenizer};
use log::{debug, info, warn};
use crate::{Article, decode_cjk_str};
use crate::post_query::post_query_wrapper;


@@ -25,9 +25,10 @@ pub struct ServerInformation {
pub host: String,
}

use crate::language_tools::tokenizer::FireSeqTokenizer;
struct DocumentSetting {
schema: tantivy::schema::Schema,
tokenizer: JiebaTokenizer,
tokenizer: FireSeqTokenizer,
}

use crate::local_llm::LlmEngine;
@@ -50,6 +51,8 @@ impl QueryEngine {
let index = indexing_documents(&server_info, &document_setting, &loaded_articles);
let (reader, query_parser) = build_reader_parser(&index, &document_setting);

debug!("Query engine construction finished");

QueryEngine {
server_info,
reader,
@@ -125,7 +128,7 @@ fn indexing_documents(server_info: &ServerInformation,
let schema = &document_setting.schema;
let index = tantivy::Index::create_in_ram(schema.clone());

index.tokenizers().register(crate::TOKENIZER_ID, document_setting.tokenizer.clone());
index.tokenizers().register(TOKENIZER_ID, document_setting.tokenizer.clone());

let mut index_writer = index.writer(50_000_000).unwrap();

@@ -161,18 +164,19 @@ fn build_document_setting() -> DocumentSetting {
}
}

use crate::language_tools::tokenizer::TOKENIZER_ID;
fn build_schema_tokenizer() -> (tantivy::schema::Schema,
JiebaTokenizer
FireSeqTokenizer
// Box<dyn tantivy::tokenizer::Tokenizer>
) {
let mut schema_builder = tantivy::schema::SchemaBuilder::default();
let text_indexing = tantivy::schema::TextFieldIndexing::default()
.set_tokenizer(crate::TOKENIZER_ID) // Set custom tokenizer
.set_tokenizer(TOKENIZER_ID) // Set custom tokenizer
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
let text_options = tantivy::schema::TextOptions::default()
.set_indexing_options(text_indexing)
.set_stored();
let tokenizer:JiebaTokenizer = JiebaTokenizer {};
let tokenizer = FireSeqTokenizer {};

let _title = schema_builder.add_text_field("title", text_options.clone());
let _body = schema_builder.add_text_field("body", text_options);
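
Taken together, the schema's text fields now reference the shared TOKENIZER_ID from language_tools, and FireSeqTokenizer is registered under that id before indexing. A condensed sketch of the wiring (paraphrased from the functions above, not verbatim; assumes the items are public):

use fire_seq_search_server::language_tools::tokenizer::{FireSeqTokenizer, TOKENIZER_ID};

fn build_index() -> tantivy::Index {
    // Text fields index through the custom tokenizer id.
    let text_indexing = tantivy::schema::TextFieldIndexing::default()
        .set_tokenizer(TOKENIZER_ID)
        .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
    let text_options = tantivy::schema::TextOptions::default()
        .set_indexing_options(text_indexing)
        .set_stored();

    let mut schema_builder = tantivy::schema::SchemaBuilder::default();
    schema_builder.add_text_field("title", text_options.clone());
    schema_builder.add_text_field("body", text_options);
    let schema = schema_builder.build();

    // The tokenizer must be registered under the same id the schema refers to.
    let index = tantivy::Index::create_in_ram(schema);
    index.tokenizers().register(TOKENIZER_ID, FireSeqTokenizer {});
    index
}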
