diff --git a/docs/release_notes_0.2_2024Sep.md b/docs/release_notes_0.2_2024Sep.md
new file mode 100644
index 0000000..fa69457
--- /dev/null
+++ b/docs/release_notes_0.2_2024Sep.md
@@ -0,0 +1,4 @@
+
+
+https://github.com/user-attachments/assets/b0a4ca66-0a33-401a-a916-af7a69f2ae7b
+
diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml
index c677c75..9dd0367 100644
--- a/fire_seq_search_server/Cargo.toml
+++ b/fire_seq_search_server/Cargo.toml
@@ -6,9 +6,6 @@ license = "MIT"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
-[features]
-#default = ["llm"]
-llm = []
 
 [dependencies]
 
@@ -63,8 +60,16 @@ pulldown-cmark = { version = "0.9.2", default-features = false }
 pdf-extract-temporary-mitigation-panic = "0.7.1"
 
 
-# llm related
-# TODO I should make them optional
-sha256 = "1.5.0"
-reqwest = { version = "0.12", features = ["json"] }
-serde_derive = "1.0.209"
+
+# TODO Currently turning these off makes cargo build fail
+# I should make these deps optional, so those who don't want LLM could have a smaller binary
+sha256 = { version = "1.5.0", optional = true }
+reqwest = { version = "0.12", features = ["json"], optional = false }
+serde_derive = { version = "1.0.209", optional = false }
+
+[features]
+#default = ["llm"]
+llm = ["sha256",
+    #"serde_derive",
+    #"reqwest"
+]
diff --git a/fire_seq_search_server/debug_server_mac.sh b/fire_seq_search_server/debug_server_mac.sh
index accc2a3..6438843 100644
--- a/fire_seq_search_server/debug_server_mac.sh
+++ b/fire_seq_search_server/debug_server_mac.sh
@@ -1,7 +1,7 @@
 set -e
 rm -f ./fire_seq_search_server
 #nix-shell -p cargo -p rustc -p libiconv --run "cargo build"
-cargo build
+cargo build --features llm
 cp target/debug/fire_seq_search_server ./fire_seq_search_server
 
 export RUST_LOG="warn,fire_seq_search_server=info"
diff --git a/fire_seq_search_server/deny.toml b/fire_seq_search_server/deny.toml
index 95c657c..944f728 100644
--- a/fire_seq_search_server/deny.toml
+++ b/fire_seq_search_server/deny.toml
@@ -23,6 +23,7 @@
 allow = [
     "MIT", "Apache-2.0", "BSD-2-Clause", "BSD-3-Clause",
     "CC0-1.0",
+    "MPL-2.0",
 ]
 # The confidence threshold for detecting a license from license text.
 # The higher the value, the more closely the license text must be to the
@@ -34,7 +35,6 @@ confidence-threshold = 0.8
 exceptions = [
     { name = "fastdivide", allow = ["zlib-acknowledgement"] },
     { name = "unicode-ident", allow = ["Unicode-DFS-2016"] },
-    { allow = ["Unlicense"], crate = "measure_time" }, # tantivy
 ]
 
 # This section is considered when running `cargo deny check bans`.
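A note on the Cargo.toml hunk above: with `reqwest` and `serde_derive` still at `optional = false`, the `llm` feature cannot shrink the binary yet, as the TODO concedes. Once all three dependencies are optional (e.g. `llm = ["dep:sha256", "dep:reqwest", "dep:serde_derive"]`, using Cargo's `dep:` syntax), the Rust side also has to stop referencing them unconditionally. A minimal sketch, assuming that end state; the cfg placement is hypothetical and not part of this patch:

    // lib.rs: compile the LLM module only when the feature is enabled
    #[cfg(feature = "llm")]
    pub mod local_llm;

    // a call site, gated the same way
    #[cfg(feature = "llm")]
    let llm_engine = fire_seq_search_server::local_llm::LlmEngine::llm_init().await;

With that wiring, `cargo build` without `--features llm` would skip the LLM code paths entirely, which the build scripts in this patch opt back into via `--features llm`.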
diff --git a/fire_seq_search_server/obsidian.sh b/fire_seq_search_server/obsidian.sh
index 6baf5cd..823317e 100755
--- a/fire_seq_search_server/obsidian.sh
+++ b/fire_seq_search_server/obsidian.sh
@@ -1,8 +1,10 @@
 set -e
-cargo build
+cargo build --features llm
 rm ./fire_seq_search_server -f
 cp --force target/debug/fire_seq_search_server ./fire_seq_search_server
 
+NOTEBOOK_NAME=AstroWiki_2.0-main
+
 RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \
-    --notebook_path ~/Documents/obsidian-hub-main \
+    --notebook_path ~/Documents/$NOTEBOOK_NAME \
     --obsidian-md
diff --git a/fire_seq_search_server/src/http_client/endpoints.rs b/fire_seq_search_server/src/http_client/endpoints.rs
index 89bbc6f..0f40899 100644
--- a/fire_seq_search_server/src/http_client/endpoints.rs
+++ b/fire_seq_search_server/src/http_client/endpoints.rs
@@ -1,10 +1,10 @@
 use std::sync::Arc;
 
-use log::{debug, info};
+use log::{debug};
 
 use crate::query_engine::{QueryEngine, ServerInformation};
 use axum::Json;
 use axum::extract::State;
-use axum::{response::Html, routing::get, Router, extract::Path};
+use axum::{response::Html, extract::Path};
 
 pub async fn get_server_info(State(engine_arc): State<Arc<QueryEngine>>) -> Json<ServerInformation> {
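For context on the handler signature above: axum extracts the shared engine via `State` and serializes the reply as JSON. A sketch of how such a handler is typically mounted (the route path and router wiring are illustrative assumptions, not code from this patch):

    use std::sync::Arc;
    use axum::{routing::get, Router};

    // engine_arc: Arc<QueryEngine>, built once at startup
    let app: Router = Router::new()
        .route("/server_info", get(get_server_info))
        .with_state(engine_arc);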
diff --git a/fire_seq_search_server/src/lib.rs b/fire_seq_search_server/src/lib.rs
index 7102a94..bc397b3 100644
--- a/fire_seq_search_server/src/lib.rs
+++ b/fire_seq_search_server/src/lib.rs
@@ -8,8 +8,9 @@ pub mod word_frequency;
 
 pub mod local_llm;
 
-use log::{debug, info};
+use log::debug;
 use crate::query_engine::ServerInformation;
+use crate::query_engine::NotebookSoftware::Logseq;
 
 
 #[macro_use]
@@ -19,6 +20,7 @@ pub static JOURNAL_PREFIX: &str = "@journal@";
 
 
 pub struct Article {
+    #[allow(dead_code)] /* TODO rethink if we need it 2024 Sep 21 */
     file_name: String,
     content: String
 }
@@ -72,7 +74,6 @@
 tantivy's default tokenizer will lowercase all English characters.
     However, I think there could be a better approach
     1. use https://github.com/pemistahl/lingua-rs to determine the language of the text
    2. Select proper tokenizer
- */
 fn process_token_text(text: &str, indices: &Vec<(usize, char)>, token: &jieba_rs::Token<'_>) -> Option<String> {
     let raw = String::from(&text[(indices[token.start].0)..(indices[token.end].0)]);
     let lower = raw.to_lowercase();
@@ -82,6 +83,7 @@ fn process_token_text(text: &str, indices: &Vec<(usize, char)>, token: &jieba_rs
         Some(lower)
     }
 }
+ */
 
 // TODO use stub now
 pub fn tokenize_default(sentence: &str) -> Vec<String> {
@@ -168,7 +170,7 @@ pub fn generate_server_info_for_test() -> ServerInformation {
         show_summary_single_line_chars_limit: 0,
         parse_pdf_links: false,
         exclude_zotero_items: false,
-        obsidian_md: false,
+        software: Logseq,
         convert_underline_hierarchy: true,
         host: "127.0.0.1:22024".to_string(),
         llm_enabled: false,
diff --git a/fire_seq_search_server/src/load_notes/mod.rs b/fire_seq_search_server/src/load_notes/mod.rs
index 96d295e..9ac794c 100644
--- a/fire_seq_search_server/src/load_notes/mod.rs
+++ b/fire_seq_search_server/src/load_notes/mod.rs
@@ -1,10 +1,7 @@
-use std::fs::DirEntry;
-use log::{debug, error, info, warn};
+use log::{debug, error, info};
 use std::process;
 
-use rayon::prelude::*;
-
 use crate::query_engine::ServerInformation;
-use crate::JOURNAL_PREFIX;
 
 
 use std::borrow::Cow;
@@ -16,10 +13,24 @@ pub struct NoteListItem {
     pub title: String,
 }
 
+use crate::query_engine::NotebookSoftware;
 pub fn retrive_note_list(server_info: &ServerInformation) -> Vec<NoteListItem> {
     let path: &str = &server_info.notebook_path;
-    let note_list = list_directory( Cow::from(path) , true);
+    let note_list = match &server_info.software {
+        NotebookSoftware::Obsidian => list_directory( Cow::from(path) , true),
+        NotebookSoftware::Logseq => {
+            let pp = path.to_string() + "/pages";
+            let mut pages = list_directory( Cow::from(pp), false );
+
+            // TODO Journal prefix
+            let pp = path.to_string() + "/journals";
+            let jours = list_directory( Cow::from(pp), false );
+
+            pages.extend(jours);
+            pages
+        },
+    };
     // TODO didn't handle logseq
     note_list
 }
@@ -82,66 +93,9 @@ fn list_directory(path: Cow<'_, str>, recursive: bool) -> Vec<NoteListItem> {
         };
         result.push(row);
     }
-    return result;
+    result
 }
 
-/*
-pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)> {
-    // I should remove the unwrap and convert it into map
-    let path: &str = &server_info.notebook_path;
-    let path = path.to_owned();
-    let pages_path = if server_info.obsidian_md {
-        path.clone()
-    } else {
-        path.clone() + "/pages"
-    };
-
-
-    let mut pages: Vec<(String, String)> = Vec::new();
-
-    let pages_tmp: Vec<(String, String)> = read_specific_directory(&pages_path).par_iter()
-        .map(|(title,md)| {
-            let content = crate::markdown_parser::parse_logseq_notebook(md, title, server_info);
-            (title.to_string(), content)
-        }).collect(); //silly collect.
-
-    if server_info.exclude_zotero_items {
-        error!("exclude zotero disabled");
-    }
-    /*
-    for (file_name, contents) in pages_tmp {
-        // info!("File Name: {}", &file_name);
-        if server_info.exclude_zotero_items && file_name.starts_with('@') {
-            continue;
-        }
-        pages.push((file_name,contents));
-    }
-    */
-
-    if server_info.enable_journal_query {
-        info!("Loading journals");
-        let journals_page = path.clone() + "/journals";
-        let journals:Vec<(String, String)>
-            = read_specific_directory(&journals_page).par_iter()
-            .map(|(title,md)| {
-                let content = crate::markdown_parser::parse_logseq_notebook(md, title, server_info);
-                let tantivy_title = JOURNAL_PREFIX.to_owned() + &title;
-                (tantivy_title, content)
-            }).collect(); //silly collect.
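A note on the "TODO Journal prefix" inside retrive_note_list above: the removed read_all_notes used to tag journal titles with JOURNAL_PREFIX before indexing, and the new Logseq branch does not yet. One possible shape for restoring that, sketched against this repo's NoteListItem and JOURNAL_PREFIX (the map itself is hypothetical, not part of this patch):

    let jours: Vec<NoteListItem> = list_directory(Cow::from(pp), false)
        .into_iter()
        .map(|mut item| {
            // journals share a flat namespace, so mark them like the old code did
            item.title = format!("{}{}", crate::JOURNAL_PREFIX, item.title);
            item
        })
        .collect();
    pages.extend(jours);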
-
-
-        for (file_name, contents) in journals {
-            pages.push((file_name,contents));
-        }
-
-    }
-
-    pages
-
-}
-
-
-*/
diff --git a/fire_seq_search_server/src/local_llm/mod.rs b/fire_seq_search_server/src/local_llm/mod.rs
index 0b15c9e..9e8673d 100644
--- a/fire_seq_search_server/src/local_llm/mod.rs
+++ b/fire_seq_search_server/src/local_llm/mod.rs
@@ -1,17 +1,35 @@
 use log::{info, error};
-use crate::query_engine::ServerInformation;
-use reqwest;
+use crate::query_engine::DocData;
+
 use std::collections::HashMap;
 use std::collections::VecDeque;
 
+use std::process::{Command, Stdio};
+use std::fs::File;
+
+use std::sync::Arc;
+use tokio::sync::Mutex;
+use tokio::task::yield_now;
+use tokio::task;
+use tokio::time;
+
+use std::borrow::Cow;
+use std::borrow::Cow::Borrowed;
+
+
+//#[cfg(feature = "llm")]
+use {
+    reqwest,
+    reqwest::StatusCode,
+    shellexpand::tilde,
+
+    serde_derive::Deserialize,
+    serde_derive::Serialize,
+};
 
-// Generated by https://transform.tools/json-to-rust-serde
-use serde_derive::Deserialize;
-use serde_derive::Serialize;
-use serde;
 
 // TODO Allow user to set prompt, instead of hard-coded in code
-const prompt_string: &'static str = r##"
+const HARD_CODED_PROMPT_STR: &'static str = r##"
 You are a seasoned summary expert, capable of condensing and summarizing given articles, papers, or posts, accurately conveying the main idea to make the content easier to understand.
 
 You place great emphasis on user experience, never adding irrelevant content like "Summary," "The summary is as follows," "Original text," "You can check the original text if interested," or "Original link." Your summaries always convey the core information directly.
@@ -22,6 +40,7 @@
 You are adept at handling various large, small, and even chaotic text content, a
 "##;
 
+// Generated by https://transform.tools/json-to-rust-serde
 #[derive(Debug, Serialize, Deserialize)]
 pub struct OpenAiData {
     pub model: String,
@@ -68,9 +87,6 @@ pub struct HealthCheck {
 // End generated
 
 const LLM_SERVER_PORT: &str = "8081"; // TODO Remove this magic number
-use std::sync::Arc;
-//use std::sync::Mutex;
-use tokio::sync::Mutex;
 
 struct JobProcessor {
     done_job: HashMap<String, String>,
@@ -102,12 +118,6 @@ pub struct LlmEngine {
 
 
 
-use std::borrow::Cow;
-use std::borrow::Cow::Borrowed;
-
-use tokio::task::yield_now;
-use tokio::task;
-use crate::query_engine::DocData;
 impl LlmEngine {
     pub async fn llm_init() -> Self {
         info!("llm called");
@@ -115,8 +125,6 @@
         let lfile = locate_llamafile().await;
         let lfile:String = lfile.unwrap();
 
-        use std::process::{Command, Stdio};
-        use std::fs::File;
         let _cmd = Command::new("sh")
             .args([ &lfile, "--nobrowser",
                 "--port", LLM_SERVER_PORT,
@@ -127,7 +135,6 @@
             .spawn()
             .expect("llm model failed to launch");
 
-        use tokio::time;
         yield_now().await;
         let wait_llm = time::Duration::from_millis(500);
         tokio::time::sleep(wait_llm).await;
@@ -136,7 +143,6 @@
 
         let endpoint = format!("http://127.0.0.1:{}", LLM_SERVER_PORT).to_string();
 
-        use reqwest::StatusCode;
         loop {
             let resp = reqwest::get(endpoint.to_owned() + "/health").await;
             let resp = match resp {
@@ -159,7 +165,7 @@
         let client = reqwest::Client::new();
 
         info!("llm engine initialized");
-        let mut map = Arc::new(Mutex::new(
+        let map = Arc::new(Mutex::new(
             JobProcessor::new()));
         Self {
             endpoint,
@@ -178,7 +184,8 @@
         }
 
         let mut msgs = Vec::new();
-        let mut chat_text = prompt_string.to_owned();
+        let prompt_string = &HARD_CODED_PROMPT_STR;
+        let mut chat_text = prompt_string.to_string();
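The startup sequence above spawns llamafile, then polls its /health endpoint until the HTTP server answers. The same pattern in isolation, with the 500 ms retry interval taken from the code above (the function name and the simplified error handling are mine):

    use tokio::time::{sleep, Duration};

    async fn wait_until_healthy(endpoint: &str) {
        loop {
            match reqwest::get(format!("{}/health", endpoint)).await {
                // llamafile is up once /health answers 200
                Ok(resp) if resp.status() == reqwest::StatusCode::OK => return,
                // connection refused or non-200: the model is still loading
                _ => sleep(Duration::from_millis(500)).await,
            }
        }
    }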
         chat_text += &full_text;
         msgs.push( build_message(chat_text) );
@@ -216,14 +223,13 @@ impl LlmEngine{
     }
 
     pub async fn call_llm_engine(&self) {
-
         let health = self.health().await.unwrap();
         if health.slots_idle == 0 {
             info!("No valid slot, continue");
             return;
         }
 
-        let mut next_job: Option<DocData> = None;
+        let next_job: Option<DocData>;
 
         let mut jcache = self.job_cache.lock().await;//.unwrap();
         next_job = jcache.job_queue.pop_front();
@@ -246,8 +252,7 @@
         let summarize_result = self.summarize(&doc.body).await;
         info!("Finished summarize job: {}", &title);
 
-        let mut jcache = self.job_cache.lock().await;//.unwrap();
-        next_job = jcache.job_queue.pop_front();
+        let mut jcache = self.job_cache.lock().await;
         jcache.done_job.insert(title, summarize_result);
         drop(jcache);
     }
@@ -282,11 +287,11 @@
 struct LlamaFileDef {
     pub filename: String,
     pub filepath: Option<String>,
     pub sha256: String,
+    #[allow(dead_code)] /* TODO rethink if we want auto download 2024 Sep 21 */
     pub download_link: String,
 }
 
 
-use shellexpand::tilde;
 async fn locate_llamafile() -> Option<String> {
     let mut lf = LlamaFileDef {
         filename: "mistral-7b-instruct-v0.2.Q4_0.llamafile".to_owned(),
@@ -300,7 +305,7 @@
     lf.filepath = Some( lf_path.to_owned() );
     info!("lf {:?}", &lf);
 
-    let ppath = std::path::Path::new(&lf_path);
+    let _ppath = std::path::Path::new(&lf_path);
     //let val = sha256::try_digest(ppath).unwrap();
     let val = "1903778f7defd921347b25327ebe5dd902f29417ba524144a8e4f7c32d83dee8";
     if val != lf.sha256 {
diff --git a/fire_seq_search_server/src/main.rs b/fire_seq_search_server/src/main.rs
index b5f39ec..37f7951 100644
--- a/fire_seq_search_server/src/main.rs
+++ b/fire_seq_search_server/src/main.rs
@@ -2,6 +2,7 @@ use log::info;
 
 use fire_seq_search_server::query_engine::{QueryEngine, ServerInformation};
 use fire_seq_search_server::local_llm::LlmEngine;
+use fire_seq_search_server::query_engine::NotebookSoftware::*;
 
 use clap::Parser;
 
@@ -118,6 +119,10 @@ fn build_server_info(args: Cli) -> ServerInformation {
         }
     };
     let host: String = args.host.clone().unwrap_or_else(|| "127.0.0.1:3030".to_string());
+    let mut software = Logseq;
+    if args.obsidian_md {
+        software = Obsidian;
+    }
     ServerInformation{
         notebook_path: args.notebook_path,
         notebook_name,
@@ -127,7 +132,7 @@
             args.show_summary_single_line_chars_limit,
         parse_pdf_links: args.parse_pdf_links,
         exclude_zotero_items:args.exclude_zotero_items,
-        obsidian_md: args.obsidian_md,
+        software,
         convert_underline_hierarchy: true,
         host,
         llm_enabled: cfg!(feature="llm"),
diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs
index 62705ce..fc727f8 100644
--- a/fire_seq_search_server/src/markdown_parser/mod.rs
+++ b/fire_seq_search_server/src/markdown_parser/mod.rs
@@ -26,13 +26,39 @@ fn hack_specific_chars_cow(text: Cow<str>) -> String {
     text.replace(bullet, " ")
 }
 
+use crate::query_engine::NotebookSoftware;
+use std::borrow::Borrow;
+use log::info;
+
+fn remove_obsidian_header<'a>(content: Cow<'a, str>) -> Cow<'a, str> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(
+            r"---[\s\S]*?---"
+        ).unwrap();
+    }
+    info!("from {:?}", &content);
+    let cr = content.borrow();
+    let ret: Cow<str> = RE.replace(cr, " ");
+    info!("into {:?}", &ret);
+    ret.into_owned().into()
+}
+
 pub fn parse_logseq_notebook(md: Cow<'_,str>, title: &str, server_info: &ServerInformation) -> String {
     // Now we do some parsing for this file
     let content = exclude_advanced_query(md);
     let content = hack_specific_chars_cow(content);
+
+    let content = Cow::from(content);
+    let content = match &server_info.software {
+        NotebookSoftware::Obsidian => remove_obsidian_header(content),
+        _ => content,
+    };
     let content: String = markdown_to_text::convert_from_logseq(
         &content, title, server_info);
+
+    //let content = content.into_owned();
     content
+
 }
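One caveat on remove_obsidian_header above: the lazy pattern ---[\s\S]*?--- matches the first pair of --- markers anywhere in the note, so a file without front matter that uses --- as a horizontal rule would lose the text between its first two rules. Anchoring the pattern to the start of the note avoids that; a possible variant (\A is the regex crate's start-of-text anchor, and this form is a suggestion, not the patch's code):

    lazy_static! {
        // hypothetical anchored form; only strips a header at the very top
        static ref RE: Regex = Regex::new(r"\A---[\s\S]*?---").unwrap();
    }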
diff --git a/fire_seq_search_server/src/post_query/app_uri.rs b/fire_seq_search_server/src/post_query/app_uri.rs
index 7859125..1d9e8c1 100644
--- a/fire_seq_search_server/src/post_query/app_uri.rs
+++ b/fire_seq_search_server/src/post_query/app_uri.rs
@@ -1,11 +1,13 @@
 use log::{error, info};
-use crate::post_query::logseq_uri::generate_logseq_uri;
+use crate::post_query::logseq_uri::{generate_logseq_uri,parse_date_from_str};
 use crate::post_query::obsidian_uri::generate_obsidian_uri;
 use crate::query_engine::ServerInformation;
 
+
 // Maybe I should wrap them with the same interface? -Zhenbo Li 2023-Feb-05
+// Deprecated on 2024-Sep-21
 pub fn generate_uri(title: &str, is_page_hit: &bool, server_info: &ServerInformation) -> String {
-    if server_info.obsidian_md {
+    if server_info.software == Obsidian {
         info!("Generating Obsidian URI for {}", title);
         if !is_page_hit {
             error!("Journal is unsupported for Obsidian yet");
@@ -14,6 +16,19 @@
         return generate_obsidian_uri(&title, *is_page_hit, &server_info);
     }
 
-    return generate_logseq_uri(&title, &is_page_hit, &server_info);
+    return generate_logseq_uri(&title, *is_page_hit, &server_info);
+}
+
+use crate::query_engine::NotebookSoftware::{Logseq,Obsidian};
-}
\ No newline at end of file
+pub fn generate_uri_v2(title: &str, server_info: &ServerInformation) -> String {
+    match &server_info.software {
+        Obsidian => generate_obsidian_uri(title, true, server_info),
+        Logseq => {
+            let dt = parse_date_from_str(title);
+            // TODO remove this duplicate calc
+            // I don't care the performance here, but I want to make code cleaner - 2024 Sep 21
+            generate_logseq_uri(title, dt.is_none(), server_info)
+        }
+    }
+}
diff --git a/fire_seq_search_server/src/post_query/hit_parsed.rs b/fire_seq_search_server/src/post_query/hit_parsed.rs
index c05f3d1..d030554 100644
--- a/fire_seq_search_server/src/post_query/hit_parsed.rs
+++ b/fire_seq_search_server/src/post_query/hit_parsed.rs
@@ -1,6 +1,6 @@
 use log::debug;
 use crate::JOURNAL_PREFIX;
-use crate::post_query::app_uri::generate_uri;
+use crate::post_query::app_uri::generate_uri_v2;
 use crate::post_query::highlighter::highlight_keywords_in_body;
 use crate::query_engine::ServerInformation;
 
@@ -48,7 +48,7 @@
             title.to_string()
         };
 
-        let logseq_uri = generate_uri(&title, &is_page_hit, server_info);
+        let logseq_uri = generate_uri_v2(&title, server_info);
 
         debug!("Processing a hit, title={}, uri={}", &title, &logseq_uri);
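On the "duplicate calc" TODO inside generate_uri_v2 above: parse_date_from_str runs once here, and generate_logseq_uri effectively re-derives the same page-vs-journal decision internally. One way to retire the TODO, sketched with this repo's generate_logseq_journal_uri (the v3 name is hypothetical, and the journal helper would need importing here):

    pub fn generate_uri_v3(title: &str, server_info: &ServerInformation) -> String {
        match &server_info.software {
            Obsidian => generate_obsidian_uri(title, true, server_info),
            Logseq => match parse_date_from_str(title) {
                // already known to be a journal: build the journal URI directly
                Some(_dt) => generate_logseq_journal_uri(title, server_info),
                None => generate_logseq_uri(title, true, server_info),
            },
        }
    }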
diff --git a/fire_seq_search_server/src/post_query/logseq_uri.rs b/fire_seq_search_server/src/post_query/logseq_uri.rs
index 8aba6e9..16dcfc7 100644
--- a/fire_seq_search_server/src/post_query/logseq_uri.rs
+++ b/fire_seq_search_server/src/post_query/logseq_uri.rs
@@ -1,4 +1,4 @@
-use log::error;
+use log::{error,info};
 use crate::ServerInformation;
 
 use url::Url;
@@ -37,8 +37,8 @@ pub fn process_note_title(file_name: &str, server_info: &ServerInformation) -> String {
     file_name
 }
 
-pub fn generate_logseq_uri(title: &str, is_page_hit: &bool, server_info: &ServerInformation) -> String {
-    return if *is_page_hit {
+pub fn generate_logseq_uri(title: &str, is_page_hit: bool, server_info: &ServerInformation) -> String {
+    return if is_page_hit {
         let title = process_note_title(title, server_info);
         let mut uri = Url::parse("logseq://graph/").unwrap();
         uri.set_path(&server_info.notebook_name);
@@ -53,7 +53,7 @@
     }
 
 #[derive(PartialEq, Debug)]
-struct JournalDate {
+pub struct JournalDate {
     pub year: u32,
     pub month: u32,
     pub date: u32,
@@ -152,9 +152,9 @@ fn parse_slice_to_u8(slice: Option<&str>) -> Option<u32> {
     }
 }
 
-fn parse_date_from_str(title: &str) -> Option<JournalDate> {
+pub fn parse_date_from_str(title: &str) -> Option<JournalDate> {
     if title.len() != 10 {
-        error!("Journal length unexpected: {}", title);
+        info!("Journal length unexpected: {}", title);
         return None;
     }
@@ -205,18 +205,18 @@ mod test_logseq_uri {
         let server_info = generate_server_info_for_test();
 
         // Don't encode / at here. It would be processed by serde. - 2022-11-27
-        let r = generate_logseq_uri("Games/EU4", &true, &server_info);
+        let r = generate_logseq_uri("Games/EU4", true, &server_info);
         assert_eq!(&r, "logseq://graph/logseq_notebook?page=Games%2FEU4");
 
-        let r = generate_logseq_uri("Games/赛马娘", &true, &server_info);
+        let r = generate_logseq_uri("Games/赛马娘", true, &server_info);
         assert_eq!(&r, "logseq://graph/logseq_notebook?page=Games%2F%E8%B5%9B%E9%A9%AC%E5%A8%98");
 
         let r = generate_logseq_journal_uri("2022_12_14", &server_info);
         assert_eq!(&r,"logseq://graph/logseq_notebook?page=Dec+14th%2C+2022");
 
-        let r = generate_logseq_uri("fireSeqSearch___test___5", &true, &server_info);
+        let r = generate_logseq_uri("fireSeqSearch___test___5", true, &server_info);
         assert_eq!(&r,"logseq://graph/logseq_notebook?page=fireSeqSearch%2Ftest%2F5");
 
-        let r = generate_logseq_uri("C++", &true, &server_info);
+        let r = generate_logseq_uri("C++", true, &server_info);
         assert_eq!(&r, "logseq://graph/logseq_notebook?page=C%2B%2B");
     }
-}
\ No newline at end of file
+}
diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs
index 51a297e..8451e05 100644
--- a/fire_seq_search_server/src/query_engine/mod.rs
+++ b/fire_seq_search_server/src/query_engine/mod.rs
@@ -1,7 +1,7 @@
 // Everything about Tantivy should be hidden behind this component
 
-use log::{debug, info, warn, error};
-use crate::{Article, decode_cjk_str};
+use log::{debug, info, error};
+use crate::decode_cjk_str;
 use crate::post_query::post_query_wrapper;
 
 use std::sync::Arc;
@@ -9,6 +9,12 @@
 
 use std::borrow::Cow;
 
+#[derive(Debug, Clone, serde::Serialize,PartialEq)]
+pub enum NotebookSoftware {
+    Logseq,
+    Obsidian,
+}
+
 // This struct should be immutable when the program starts running
 #[derive(Debug, Clone, serde::Serialize)]
 pub struct ServerInformation {
@@ -19,7 +25,7 @@
     pub show_summary_single_line_chars_limit: usize,
     pub parse_pdf_links: bool,
     pub exclude_zotero_items:bool,
-    pub obsidian_md: bool,
+    pub software: NotebookSoftware,
 
     /// Experimental. Not sure if I should use this global config - 2022-12-30
     pub convert_underline_hierarchy: bool,
@@ -61,14 +67,6 @@ impl QueryEngine {
         let index: tantivy::Index = QueryEngine::build_index(&server_info,
             &document_setting, note_list).await;
 
-
-        /*
-        let loaded_notes = crate::load_notes::read_all_notes(&server_info);
-        let loaded_articles: Vec<Article> = loaded_notes.into_iter().map(
-            |x| Article{file_name:x.0, content:x.1}
-        ).collect();
-        let index = indexing_documents(&server_info, &document_setting, &loaded_articles);
-        */
         let (reader, query_parser) = build_reader_parser(&index, &document_setting);
 
         debug!("Query engine construction finished");
@@ -89,8 +87,6 @@
             note: NoteListItem,
             index_writer: &IndexWriter) {
 
-        info!(" inside future {:?}", note);
-
         let raw_content = match std::fs::read_to_string(&note.realpath) {
             Ok(s) => s,
             Err(e) => {
@@ -278,41 +274,6 @@ fn build_reader_parser(index: &tantivy::Index, document_setting: &DocumentSetting)
     (reader, query_parser)
 }
 
-/*
-fn indexing_documents(server_info: &ServerInformation,
-                      document_setting: &DocumentSetting,
-                      pages: &Vec<Article>) -> tantivy::Index {
-
-    let schema = &document_setting.schema;
-    let index = tantivy::Index::create_in_ram(schema.clone());
-
-    index.tokenizers().register(TOKENIZER_ID, document_setting.tokenizer.clone());
-
-    let mut index_writer = index.writer(50_000_000).unwrap();
-
-
-    if server_info.obsidian_md {
-        warn!("Obsidian mode.");
-        assert!(!server_info.enable_journal_query);
-    }
-
-    let title = schema.get_field("title").unwrap();
-    let body = schema.get_field("body").unwrap();
-
-
-    for article in pages {
-        index_writer.add_document(
-            tantivy::doc!{ title => article.file_name.clone(),
-                body => article.content.clone()}
-        ).unwrap();
-    }
-    index_writer.commit().unwrap();
-    index
-}
-*/
-
-
-
 fn build_document_setting() -> DocumentSetting {
     let (schema, tokenizer) = build_schema_tokenizer();
     DocumentSetting{
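The NotebookSoftware enum above is the load-bearing piece of this refactor: replacing the obsidian_md boolean means every branch point must name the software explicitly, and the compiler enforces exhaustiveness. A hypothetical third backend makes the benefit concrete:

    match &server_info.software {
        NotebookSoftware::Logseq => { /* pages + journals layout */ },
        NotebookSoftware::Obsidian => { /* single vault directory */ },
        // adding e.g. NotebookSoftware::Zettlr to the enum would turn this
        // match into a compile error until the new layout is handled
    }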
diff --git a/fire_seq_search_server/tests/unit_test_load_notes.rs b/fire_seq_search_server/tests/unit_test_load_notes.rs
index 612f640..d553336 100644
--- a/fire_seq_search_server/tests/unit_test_load_notes.rs
+++ b/fire_seq_search_server/tests/unit_test_load_notes.rs
@@ -1,6 +1,7 @@
-use fire_seq_search_server::load_notes::read_specific_directory;
 use fire_seq_search_server::markdown_parser::{exclude_advanced_query, parse_to_plain_text};
 
+use std::borrow::Cow;
+
 
 fn load_articles() -> Vec<(String, String)> {
     let r = read_specific_directory("tests/resource/pages");
@@ -39,12 +40,79 @@ fn parse() {
 
 #[test]
 fn exclude_advance_query() {
     let md = read_file_to_line("advanced_query.md");
-    let result = exclude_advanced_query(&md);
+    let md = Cow::from(md);
+    let result = exclude_advanced_query(md);
     assert!(!result.contains("exempli"));
     assert!(result.contains("In this test page we have"));
 
     let md = read_file_to_line("blog_thunderbird_zh.md");
-    let result = exclude_advanced_query(&md);
+    let md = Cow::from(md);
+    let result = exclude_advanced_query(md.clone());
     assert_eq!(md, result);
-}
\ No newline at end of file
+}
+
+
+
+
+
+
+
+// =====================
+// These functions are removed in https://github.com/Endle/fireSeqSearch/pull/149/commits/7692bd9091380858b0cbeb2fa10d8c01dabcba91
+// aka https://github.com/Endle/fireSeqSearch/pull/147
+// To make unit tests happy, I copied them as test helper functions
+// Zhenbo - 2024 Sep 21
+use std::fs::DirEntry;
+use rayon::iter::IntoParallelRefIterator;
+use rayon::iter::ParallelIterator;
+use std::process;
+fn read_md_file_wo_parse(note: &std::fs::DirEntry) -> Option<(String, String)> {
+    if let Ok(file_type) = note.file_type() {
+        // Now let's show our entry's file type!
+        if file_type.is_dir() {
+            return None;
+        }
+    } else {
+        return None;
+    }
+
+    let note_path = note.path();
+    let note_title = match note_path.file_stem() {
+        Some(osstr) => osstr.to_str().unwrap(),
+        None => {
+            return None;
+        }
+    };
+    let content: String = match std::fs::read_to_string(&note_path) {
+        Ok(c) => c,
+        Err(_e) => {
+            if note_title.to_lowercase() == ".ds_store" {
+                // .DS_Store is expected to be unreadable as UTF-8; ignore it quietly
+            }
+            return None;
+        }
+    };
+
+    Some((note_title.to_string(), content))
+}
+fn read_specific_directory(path: &str) -> Vec<(String, String)> {
+    let notebooks = match std::fs::read_dir(path) {
+        Ok(x) => x,
+        Err(_e) => {
+            process::abort();
+        }
+    };
+    let mut note_filenames: Vec<DirEntry> = Vec::new();
+    for note in notebooks {
+        let note: DirEntry = note.unwrap();
+        note_filenames.push(note);
+    }
+    let result: Vec<(String,String)> = note_filenames.par_iter()
+        .map(|note| read_md_file_wo_parse(&note))
+        .filter(|x| x.is_some())
+        .map(|x| x.unwrap())
+        .collect();
+
+    result
+}