From 7692bd9091380858b0cbeb2fa10d8c01dabcba91 Mon Sep 17 00:00:00 2001
From: Zhenbo Li <3221521+Endle@users.noreply.github.com>
Date: Sat, 21 Sep 2024 10:23:19 -0400
Subject: [PATCH] Rewrite load notebook logic, support Obsidian's recursive
 structure (#147)

---
 fire_seq_search_server/Cargo.toml             |   4 +-
 fire_seq_search_server/obsidian.sh            |   6 +-
 fire_seq_search_server/src/load_notes/mod.rs  | 163 +++++++++---------
 fire_seq_search_server/src/main.rs            |   6 +-
 .../src/markdown_parser/mod.rs                |  11 +-
 .../src/query_engine/mod.rs                   | 105 +++++++++--
 6 files changed, 190 insertions(+), 105 deletions(-)
 mode change 100644 => 100755 fire_seq_search_server/obsidian.sh

diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml
index e052d4f..c677c75 100644
--- a/fire_seq_search_server/Cargo.toml
+++ b/fire_seq_search_server/Cargo.toml
@@ -7,7 +7,7 @@ license = "MIT"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [features]
-default = ["llm"]
+#default = ["llm"]
 llm = []
 
 [dependencies]
@@ -37,6 +37,7 @@ env_logger = "0.11.5"
 clap = { version = "4.0", features = ["derive"] }
 lazy_static = "1.4.0"
 rayon = "1.5"
+futures = "0.3"
 
 urlencoding = "2.1.0"
 
@@ -66,5 +67,4 @@ pdf-extract-temporary-mitigation-panic = "0.7.1"
 # TODO I should make them optional
 sha256 = "1.5.0"
 reqwest = { version = "0.12", features = ["json"] }
-futures = "0.3"
 serde_derive = "1.0.209"
diff --git a/fire_seq_search_server/obsidian.sh b/fire_seq_search_server/obsidian.sh
old mode 100644
new mode 100755
index 61e9d91..6baf5cd
--- a/fire_seq_search_server/obsidian.sh
+++ b/fire_seq_search_server/obsidian.sh
@@ -1,8 +1,8 @@
 set -e
 cargo build
 rm ./fire_seq_search_server -f
-cp --force target/debug/fire_seq_search_server.exe ./fire_seq_search_server
+cp --force target/debug/fire_seq_search_server ./fire_seq_search_server
 RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \
---notebook_path /c/Users/z2369li/Documents/graph-note-of-greek-myth/希腊神话 \
---obsidian-md
+    --notebook_path ~/Documents/obsidian-hub-main \
+    --obsidian-md
diff --git a/fire_seq_search_server/src/load_notes/mod.rs b/fire_seq_search_server/src/load_notes/mod.rs
index 2c2d027..96d295e 100644
--- a/fire_seq_search_server/src/load_notes/mod.rs
+++ b/fire_seq_search_server/src/load_notes/mod.rs
@@ -7,6 +7,86 @@ use crate::query_engine::ServerInformation;
 use crate::JOURNAL_PREFIX;
 
 
+use std::borrow::Cow;
+use std::borrow::Borrow;
+
+#[derive(Debug, Clone)]
+pub struct NoteListItem {
+    pub realpath: String,
+    pub title: String,
+}
+
+pub fn retrive_note_list(server_info: &ServerInformation) -> Vec<NoteListItem> {
+    let path: &str = &server_info.notebook_path;
+    let note_list = list_directory(Cow::from(path), true);
+
+    // TODO: Logseq notebooks are not handled here yet
+    note_list
+}
+
+fn list_directory(path: Cow<'_, str>, recursive: bool) -> Vec<NoteListItem> {
+    debug!("Listing directory {}", &path);
+    let mut result = Vec::new();
+
+    let path_ref: &str = path.borrow();
+    let notebooks = match std::fs::read_dir(path_ref) {
+        Ok(x) => x,
+        Err(e) => {
+            error!("Fatal error ({:?}) when reading {}", e, &path);
+            process::abort();
+        }
+    };
+
+    for note_result in notebooks {
+        let entry = match note_result {
+            Ok(x) => x,
+            Err(e) => {
+                error!("Error while iterating directory entries: {:?}", &e);
+                continue;
+            }
+        };
+        let file_type = match entry.file_type() {
+            Ok(x) => x,
+            Err(e) => {
+                error!("Error: Can't get file type {:?} {:?}", &entry, &e);
+                continue;
+            }
+        };
+
+        let entry_path = entry.path();
+        let entry_path_str = entry_path.to_string_lossy();
+
+        if file_type.is_dir() {
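+            // Obsidian vaults nest notes in sub-folders at arbitrary depth,
+            // so descend into every directory when recursion is requested.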
+            if recursive {
+                let next = list_directory(entry_path_str, true);
+                result.extend(next);
+            }
+            continue;
+        }
+
+        if !entry_path_str.ends_with(".md") {
+            info!("skip non-md file {:?}", &entry);
+            continue;
+        }
+
+        let note_title = match entry_path.file_stem() {
+            Some(osstr) => osstr.to_str().unwrap(),
+            None => {
+                error!("Couldn't get file_stem for {:?}", entry_path);
+                continue;
+            }
+        };
+        let row = NoteListItem {
+            realpath: entry_path_str.to_string(),
+            title: note_title.to_string(),
+        };
+        result.push(row);
+    }
+
+    return result;
+}
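+
+// NOTE: the old loading pipeline below is superseded by retrive_note_list().
+// read_all_notes() is kept inside a block comment (it still documents the
+// Logseq journal handling); read_specific_directory() and
+// read_md_file_wo_parse() are deleted outright.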
+/*
 pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)> {
     // I should remove the unwrap and convert it into map
     let path: &str = &server_info.notebook_path;
@@ -26,7 +106,10 @@ pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)>
             (title.to_string(), content)
         }).collect(); //silly collect.
 
-    // TODO: Silly filter
+    if server_info.exclude_zotero_items {
+        error!("exclude zotero disabled");
+    }
+    /*
     for (file_name, contents) in pages_tmp {
         // info!("File Name: {}", &file_name);
         if server_info.exclude_zotero_items && file_name.starts_with('@') {
@@ -34,6 +117,7 @@ pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)>
         }
         pages.push((file_name,contents));
     }
+    */
     if server_info.enable_journal_query {
         info!("Loading journals");
         let journals_page = path.clone() + "/journals";
@@ -56,84 +140,9 @@ pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)>
 }
 
-pub fn read_specific_directory(path: &str) -> Vec<(String, String)> {
-    info!("Try to read {}", &path);
-    let notebooks = match std::fs::read_dir(path) {
-        Ok(x) => x,
-        Err(e) => {
-            error!("Fatal error ({:?}) when reading {}", e, path);
-            process::abort();
-        }
-    };
-    let mut note_filenames: Vec<DirEntry> = Vec::new();
-    for note in notebooks {
-        let note : DirEntry = note.unwrap();
-        note_filenames.push(note);
-    }
-    // debug!("Note titles: {:?}", &note_filenames);
-    let result: Vec<(String,String)> = note_filenames.par_iter()
-        .map(|note| read_md_file_wo_parse(&note))
-        .filter(|x| (&x).is_some())
-        .map(|x| x.unwrap())
-        .collect();
-    info!("Loaded {} notes from {}", result.len(), path);
-    // info!("After map {:?}", &result);
-
-    result
-}
-
+*/
-
-///
-///
-/// # Arguments
-///
-/// * `note`:
-///
-/// returns: Option<(String, String)>
-///
-/// First: title (filename)
-/// Second: full raw text
-///
-/// I would delay the parsing job, so it could be couples with server info. -Zhenbo Li 2023-02-17
-/// If input is a directory or DS_STORE, return None
-///
-pub fn read_md_file_wo_parse(note: &std::fs::DirEntry) -> Option<(String, String)> {
-    if let Ok(file_type) = note.file_type() {
-        // Now let's show our entry's file type!
-        debug!("{:?}: {:?}", note.path(), file_type);
-        if file_type.is_dir() {
-            debug!("{:?} is a directory, skipping", note.path());
-            return None;
-        }
-    } else {
-        warn!("Couldn't get file type for {:?}", note.path());
-        return None;
-    }
-
-    let note_path = note.path();
-    let note_title = match note_path.file_stem() {
-        Some(osstr) => osstr.to_str().unwrap(),
-        None => {
-            error!("Couldn't get file_stem for {:?}", note.path());
-            return None;
-        }
-    };
-    debug!("note title: {}", &note_title);
-    let content : String = match std::fs::read_to_string(&note_path) {
-        Ok(c) => c,
-        Err(e) => {
-            if note_title.to_lowercase() == ".ds_store" {
-                debug!("Ignore .DS_Store for mac");
-            } else {
-                error!("Error({:?}) when reading the file {:?}", e, note_path);
-            }
-            return None;
-        }
-    };
-
-    Some((note_title.to_string(),content))
-}
diff --git a/fire_seq_search_server/src/main.rs b/fire_seq_search_server/src/main.rs
index a9c1ea4..b5f39ec 100644
--- a/fire_seq_search_server/src/main.rs
+++ b/fire_seq_search_server/src/main.rs
@@ -68,7 +68,7 @@ async fn main() {
     let matches = Cli::parse();
     let server_info: ServerInformation = build_server_info(matches);
 
-    let mut engine = QueryEngine::construct(server_info);
+    let mut engine = QueryEngine::construct(server_info).await;
     info!("query engine build finished");
 
     if cfg!(feature="llm") {
@@ -77,15 +77,13 @@ async fn main() {
         let llm_poll = llm_arc.clone();
         engine.llm = Some(llm_arc);
 
-        let poll_handle = tokio::spawn( async move {
+        let _poll_handle = tokio::spawn( async move {
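+            // Fire-and-forget: the poll loop runs for the whole lifetime of
+            // the process, so the JoinHandle is deliberately left unawaited.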
             loop {
                 llm_poll.call_llm_engine().await;
                 let wait_llm = tokio::time::Duration::from_millis(500);
                 tokio::time::sleep(wait_llm).await;
             }
         });
-//        poll_handle.await;
-
     }
 
     let engine_arc = std::sync::Arc::new(engine);
diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs
index 26baf8a..62705ce 100644
--- a/fire_seq_search_server/src/markdown_parser/mod.rs
+++ b/fire_seq_search_server/src/markdown_parser/mod.rs
@@ -7,9 +7,9 @@ use crate::query_engine::ServerInformation;
 
 // https://docs.rs/regex/latest/regex/#repetitions
 // https://stackoverflow.com/a/8303552/1166518
-pub fn exclude_advanced_query(md: &str) -> Cow<str> {
+pub fn exclude_advanced_query(md: Cow<'_,str>) -> Cow<'_, str> {
     if !md.contains('#') {
-        return Cow::Borrowed(md);
+        return md;
     }
 
     lazy_static! {
@@ -17,8 +17,7 @@ pub fn exclude_advanced_query(md: &str) -> Cow<str> {
             r"\#\+BEGIN_QUERY[\S\s]+?\#\+END_QUERY")
             .unwrap();
     }
-    // return RE.replace_all(&md, " ")
-    return RE.replace_all(&md, " ");
+    return RE.replace_all(&md, " ").into_owned().into();
 }
 
 fn hack_specific_chars_cow(text: Cow<str>) -> String {
@@ -27,7 +26,7 @@ fn hack_specific_chars_cow(text: Cow<str>) -> String {
     text.replace(bullet, " ")
 }
 
-pub fn parse_logseq_notebook(md: &str, title: &str, server_info: &ServerInformation) -> String {
+pub fn parse_logseq_notebook(md: Cow<'_,str>, title: &str, server_info: &ServerInformation) -> String {
     // Now we do some parsing for this file
     let content = exclude_advanced_query(md);
     let content = hack_specific_chars_cow(content);
@@ -50,4 +49,4 @@ fn hack_specific_chars(text: String) -> String {
     let bullet = char::from_u32(0x00002022).unwrap();
     // println!("{}", bullet);
     text.replace(bullet, " ")
-}
\ No newline at end of file
+}
diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs
index cb7c021..51a297e 100644
--- a/fire_seq_search_server/src/query_engine/mod.rs
+++ b/fire_seq_search_server/src/query_engine/mod.rs
@@ -1,12 +1,13 @@
 // Everything about Tantivy should be hidden behind this component
 
-use log::{debug, info, warn};
+use log::{debug, info, warn, error};
 
 use crate::{Article, decode_cjk_str};
 use crate::post_query::post_query_wrapper;
 
 use std::sync::Arc;
+use std::borrow::Cow;
 
 
 // This struct should be immutable when the program starts running
@@ -39,19 +40,35 @@ pub struct QueryEngine {
     pub server_info: ServerInformation,
     reader: tantivy::IndexReader,
     query_parser: tantivy::query::QueryParser,
-    articles: Vec<Article>, //TODO remove it. only word cloud needs it
+    //articles: Vec<Article>, //TODO remove it. only word cloud needs it
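+    // Article bodies are no longer cached on the engine; only the word cloud
+    // consumed them, and generate_wordcloud() is stubbed out below.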
     pub llm: Option<Arc<LlmEngine>>,
 }
 
+use tantivy::IndexWriter;
+use tantivy::TantivyDocument;
+
+use crate::load_notes::NoteListItem;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
+
+use tantivy::doc;
+
 impl QueryEngine {
-    pub fn construct(server_info: ServerInformation) -> Self {
+    pub async fn construct(server_info: ServerInformation) -> Self {
         let document_setting: DocumentSetting = build_document_setting();
+        let note_list = crate::load_notes::retrive_note_list(&server_info);
+        let index: tantivy::Index = QueryEngine::build_index(&server_info,
+            &document_setting,
+            note_list).await;
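+
+        // The previous synchronous load-and-index path is kept below,
+        // commented out, for reference.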
+        /*
         let loaded_notes = crate::load_notes::read_all_notes(&server_info);
         let loaded_articles: Vec<Article> = loaded_notes.into_iter().map(
             |x| Article{file_name:x.0, content:x.1}
         ).collect();
-        let index = indexing_documents(&server_info, &document_setting, &loaded_articles);
+        */
 
         let (reader, query_parser) = build_reader_parser(&index, &document_setting);
         debug!("Query engine construction finished");
@@ -60,10 +77,77 @@ impl QueryEngine {
             server_info,
             reader,
             query_parser,
-            articles: loaded_articles,
+            // articles: Vec::new(),
+            // articles: loaded_articles,
             llm: None,
         }
     }
+
+    // Read one note from disk, strip Logseq/Obsidian markup, and queue it on
+    // the shared IndexWriter; unreadable files are logged and skipped.
+    async fn load_single_note(
+        server_info: &ServerInformation,
+        document_setting: &DocumentSetting,
+        note: NoteListItem,
+        index_writer: &IndexWriter) {
+
+        info!("inside future {:?}", note);
+
+        let raw_content = match std::fs::read_to_string(&note.realpath) {
+            Ok(s) => s,
+            Err(e) => {
+                error!("Failed to read {:?}, err ({:?}), skipping", &note, &e);
+                return;
+            }
+        };
+
+        let content = crate::markdown_parser::parse_logseq_notebook(
+            Cow::from(raw_content), &note.title, server_info);
+
+        let schema = &document_setting.schema;
+        let title = schema.get_field("title").unwrap();
+        let body = schema.get_field("body").unwrap();
+        index_writer.add_document(
+            tantivy::doc!{
+                title => note.title,
+                body => content,
+            }
+        ).unwrap();
+    }
+
+    async fn load_all_notes(server_info: &ServerInformation,
+        document_setting: &DocumentSetting,
+        note_list: Vec<NoteListItem>,
+        index_writer: &IndexWriter) {
+
+        let mut futs: FuturesUnordered<_> = FuturesUnordered::new();
+        for article in note_list {
+            futs.push(
+                QueryEngine::load_single_note(
+                    server_info,
+                    document_setting,
+                    article,
+                    index_writer)
+            );
+        }
+        // Drive all per-note futures to completion.
+        while let Some(_result) = futs.next().await {}
+    }
+
+    async fn build_index(server_info: &ServerInformation,
+        document_setting: &DocumentSetting,
+        note_list: Vec<NoteListItem>) -> tantivy::Index {
+
+        let schema = &document_setting.schema;
+        let index = tantivy::Index::create_in_ram(schema.clone());
+
+        index.tokenizers().register(TOKENIZER_ID, document_setting.tokenizer.clone());
+        let mut index_writer = index.writer(50_000_000).unwrap();
+
+        QueryEngine::load_all_notes(&server_info,
+            &document_setting,
+            note_list,
+            &index_writer).await;
+
+        index_writer.commit().unwrap();
+        index
+    }
 }
 
 #[derive(Debug)]
@@ -95,11 +179,9 @@ impl DocData {
 }
 
 impl QueryEngine {
-
-
-
     pub fn generate_wordcloud(self: &Self) -> String {
-        crate::word_frequency::generate_wordcloud(&self.articles)
+        String::from("TODO: wordcloud is turned off")
+        //crate::word_frequency::generate_wordcloud(&self.articles)
     }
 
     pub async fn query_pipeline(self: &Self, term: String) -> String {
@@ -126,7 +208,6 @@ impl QueryEngine {
 
         let json = serde_json::to_string(&result).unwrap();
-        // info!("Search result {}", &json);
         json
     }
@@ -156,7 +237,6 @@ impl QueryEngine {
         };
         tokio::time::sleep(wait_llm).await;
     }
-        // llm.summarize(&title).await
 }
 pub async fn summarize(&self, title: String) -> String {
     info!("Called summarize on {}", &title);
@@ -198,6 +278,7 @@ fn build_reader_parser(index: &tantivy::Index, document_setting: &DocumentSettin
     (reader, query_parser)
 }
 
+/*
 fn indexing_documents(server_info: &ServerInformation,
                       document_setting: &DocumentSetting,
                       pages:&Vec<Article>) -> tantivy::Index {
@@ -225,12 +306,10 @@ fn indexing_documents(server_info: &ServerInformation,
                 body => article.content.clone()}
         ).unwrap();
     }
-
-
-
     index_writer.commit().unwrap();
     index
 }
+*/