From 7692bd9091380858b0cbeb2fa10d8c01dabcba91 Mon Sep 17 00:00:00 2001
From: Zhenbo Li <3221521+Endle@users.noreply.github.com>
Date: Sat, 21 Sep 2024 10:23:19 -0400
Subject: [PATCH] Rewrite load notebook logic, support Obsidian's recursive
 structure (#147)

---
 fire_seq_search_server/Cargo.toml             |   4 +-
 fire_seq_search_server/obsidian.sh            |   6 +-
 fire_seq_search_server/src/load_notes/mod.rs  | 163 +++++++++---------
 fire_seq_search_server/src/main.rs            |   6 +-
 .../src/markdown_parser/mod.rs                |  11 +-
 .../src/query_engine/mod.rs                   | 105 +++++++++--
 6 files changed, 190 insertions(+), 105 deletions(-)
 mode change 100644 => 100755 fire_seq_search_server/obsidian.sh

diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml
index e052d4f..c677c75 100644
--- a/fire_seq_search_server/Cargo.toml
+++ b/fire_seq_search_server/Cargo.toml
@@ -7,7 +7,7 @@ license = "MIT"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [features]
-default = ["llm"]
+#default = ["llm"]
 llm = []
 
 [dependencies]
@@ -37,6 +37,7 @@ env_logger = "0.11.5"
 clap = { version = "4.0", features = ["derive"] }
 lazy_static = "1.4.0"
 rayon = "1.5"
+futures = "0.3"
 
 urlencoding = "2.1.0"
 
@@ -66,5 +67,4 @@ pdf-extract-temporary-mitigation-panic = "0.7.1"
 # TODO I should make them optional
 sha256 = "1.5.0"
 reqwest = { version = "0.12", features = ["json"] }
-futures = "0.3"
 serde_derive = "1.0.209"
diff --git a/fire_seq_search_server/obsidian.sh b/fire_seq_search_server/obsidian.sh
old mode 100644
new mode 100755
index 61e9d91..6baf5cd
--- a/fire_seq_search_server/obsidian.sh
+++ b/fire_seq_search_server/obsidian.sh
@@ -1,8 +1,8 @@
 set -e
 cargo build
 rm ./fire_seq_search_server -f
-cp --force target/debug/fire_seq_search_server.exe ./fire_seq_search_server
+cp --force target/debug/fire_seq_search_server ./fire_seq_search_server
 RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \
---notebook_path /c/Users/z2369li/Documents/graph-note-of-greek-myth/希腊神话 \
---obsidian-md
+    --notebook_path ~/Documents/obsidian-hub-main \
+    --obsidian-md
diff --git a/fire_seq_search_server/src/load_notes/mod.rs b/fire_seq_search_server/src/load_notes/mod.rs
index 2c2d027..96d295e 100644
--- a/fire_seq_search_server/src/load_notes/mod.rs
+++ b/fire_seq_search_server/src/load_notes/mod.rs
@@ -7,6 +7,86 @@ use crate::query_engine::ServerInformation;
 use crate::JOURNAL_PREFIX;
 
 
+use std::borrow::Cow;
+use std::borrow::Borrow;
+
+#[derive(Debug, Clone)]
+pub struct NoteListItem {
+    pub realpath: String,
+    pub title: String,
+}
+
+pub fn retrive_note_list(server_info: &ServerInformation) -> Vec<NoteListItem> {
+    let path: &str = &server_info.notebook_path;
+    let note_list = list_directory(Cow::from(path), true);
+
+    // TODO: Logseq notebooks are not handled here yet
+    note_list
+}
+
+fn list_directory(path: Cow<'_, str>, recursive: bool) -> Vec<NoteListItem> {
+    debug!("Listing directory {}", &path);
+    let mut result = Vec::new();
+
+    let path_ref: &str = path.borrow();
+    let notebooks = match std::fs::read_dir(path_ref) {
+        Ok(x) => x,
+        Err(e) => {
+            error!("Fatal error ({:?}) when reading {}", e, &path);
+            process::abort();
+        }
+    };
+
+    for note_result in notebooks {
+        let entry = match note_result {
+            Ok(x) => x,
+            Err(e) => {
+                error!("Error while iterating directory entries: {:?}", &e);
+                continue;
+            }
+        };
+        let file_type = match entry.file_type() {
+            Ok(x) => x,
+            Err(e) => {
+                error!("Error: Can't get file type {:?} {:?}", &entry, &e);
+                continue;
+            }
+        };
+
+        let entry_path = entry.path();
+        let entry_path_str = entry_path.to_string_lossy();
+
+        if file_type.is_dir() {
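+            // Obsidian vaults nest notes in sub-folders at arbitrary depth,
+            // so descend into every directory when recursion is requested.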
+            if recursive {
+                let next = list_directory(entry_path_str, true);
+                result.extend(next);
+            }
+            continue;
+        }
+
+        if !entry_path_str.ends_with(".md") {
+            info!("skip non-md file {:?}", &entry);
+            continue;
+        }
+
+        let note_title = match entry_path.file_stem() {
+            Some(osstr) => osstr.to_str().unwrap(),
+            None => {
+                error!("Couldn't get file_stem for {:?}", entry_path);
+                continue;
+            }
+        };
+        let row = NoteListItem {
+            realpath: entry_path_str.to_string(),
+            title: note_title.to_string(),
+        };
+        result.push(row);
+    }
+
+    return result;
+}
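+
+// NOTE: the old loading pipeline below is superseded by retrive_note_list().
+// read_all_notes() is kept inside a block comment (it still documents the
+// Logseq journal handling); read_specific_directory() and
+// read_md_file_wo_parse() are deleted outright.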
+/*
 pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)> {
     // I should remove the unwrap and convert it into map
     let path: &str = &server_info.notebook_path;
@@ -26,7 +106,10 @@ pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)>
             (title.to_string(), content)
         }).collect(); //silly collect.
 
-    // TODO: Silly filter
+    if server_info.exclude_zotero_items {
+        error!("exclude zotero disabled");
+    }
+    /*
     for (file_name, contents) in pages_tmp {
         // info!("File Name: {}", &file_name);
         if server_info.exclude_zotero_items && file_name.starts_with('@') {
@@ -34,6 +117,7 @@ pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)>
         }
         pages.push((file_name,contents));
     }
+    */
     if server_info.enable_journal_query {
         info!("Loading journals");
         let journals_page = path.clone() + "/journals";
@@ -56,84 +140,9 @@ pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)>
 }
 
-pub fn read_specific_directory(path: &str) -> Vec<(String, String)> {
-    info!("Try to read {}", &path);
-    let notebooks = match std::fs::read_dir(path) {
-        Ok(x) => x,
-        Err(e) => {
-            error!("Fatal error ({:?}) when reading {}", e, path);
-            process::abort();
-        }
-    };
-    let mut note_filenames: Vec<DirEntry> = Vec::new();
-    for note in notebooks {
-        let note : DirEntry = note.unwrap();
-        note_filenames.push(note);
-    }
-    // debug!("Note titles: {:?}", &note_filenames);
-    let result: Vec<(String,String)> = note_filenames.par_iter()
-        .map(|note| read_md_file_wo_parse(&note))
-        .filter(|x| (&x).is_some())
-        .map(|x| x.unwrap())
-        .collect();
-    info!("Loaded {} notes from {}", result.len(), path);
-    // info!("After map {:?}", &result);
-
-    result
-}
-
+*/
-
-///
-///
-/// # Arguments
-///
-/// * `note`:
-///
-/// returns: Option<(String, String)>
-///
-/// First: title (filename)
-/// Second: full raw text
-///
-/// I would delay the parsing job, so it could be couples with server info. -Zhenbo Li 2023-02-17
-/// If input is a directory or DS_STORE, return None
-///
-pub fn read_md_file_wo_parse(note: &std::fs::DirEntry) -> Option<(String, String)> {
-    if let Ok(file_type) = note.file_type() {
-        // Now let's show our entry's file type!
-        debug!("{:?}: {:?}", note.path(), file_type);
-        if file_type.is_dir() {
-            debug!("{:?} is a directory, skipping", note.path());
-            return None;
-        }
-    } else {
-        warn!("Couldn't get file type for {:?}", note.path());
-        return None;
-    }
-
-    let note_path = note.path();
-    let note_title = match note_path.file_stem() {
-        Some(osstr) => osstr.to_str().unwrap(),
-        None => {
-            error!("Couldn't get file_stem for {:?}", note.path());
-            return None;
-        }
-    };
-    debug!("note title: {}", &note_title);
-    let content : String = match std::fs::read_to_string(&note_path) {
-        Ok(c) => c,
-        Err(e) => {
-            if note_title.to_lowercase() == ".ds_store" {
-                debug!("Ignore .DS_Store for mac");
-            } else {
-                error!("Error({:?}) when reading the file {:?}", e, note_path);
-            }
-            return None;
-        }
-    };
-
-    Some((note_title.to_string(),content))
-}
diff --git a/fire_seq_search_server/src/main.rs b/fire_seq_search_server/src/main.rs
index a9c1ea4..b5f39ec 100644
--- a/fire_seq_search_server/src/main.rs
+++ b/fire_seq_search_server/src/main.rs
@@ -68,7 +68,7 @@ async fn main() {
     let matches = Cli::parse();
     let server_info: ServerInformation = build_server_info(matches);
 
-    let mut engine = QueryEngine::construct(server_info);
+    let mut engine = QueryEngine::construct(server_info).await;
     info!("query engine build finished");
 
     if cfg!(feature="llm") {
@@ -77,15 +77,13 @@ async fn main() {
         let llm_poll = llm_arc.clone();
         engine.llm = Some(llm_arc);
 
-        let poll_handle = tokio::spawn( async move {
+        let _poll_handle = tokio::spawn( async move {
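+            // Fire-and-forget: the poll loop runs for the whole lifetime of
+            // the process, so the JoinHandle is deliberately left unawaited.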
             loop {
                 llm_poll.call_llm_engine().await;
                 let wait_llm = tokio::time::Duration::from_millis(500);
                 tokio::time::sleep(wait_llm).await;
             }
         });
-//        poll_handle.await;
-
     }
 
     let engine_arc = std::sync::Arc::new(engine);
diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs
index 26baf8a..62705ce 100644
--- a/fire_seq_search_server/src/markdown_parser/mod.rs
+++ b/fire_seq_search_server/src/markdown_parser/mod.rs
@@ -7,9 +7,9 @@ use crate::query_engine::ServerInformation;
 
 // https://docs.rs/regex/latest/regex/#repetitions
 // https://stackoverflow.com/a/8303552/1166518
-pub fn exclude_advanced_query(md: &str) -> Cow<str> {
+pub fn exclude_advanced_query(md: Cow<'_,str>) -> Cow<'_, str> {
     if !md.contains('#') {
-        return Cow::Borrowed(md);
+        return md;
     }
 
     lazy_static! {
@@ -17,8 +17,7 @@ pub fn exclude_advanced_query(md: &str) -> Cow<str> {
             r"\#\+BEGIN_QUERY[\S\s]+?\#\+END_QUERY")
             .unwrap();
     }
-    // return RE.replace_all(&md, " ")
-    return RE.replace_all(&md, " ");
+    return RE.replace_all(&md, " ").into_owned().into();
 }
 
 fn hack_specific_chars_cow(text: Cow<str>) -> String {
@@ -27,7 +26,7 @@ fn hack_specific_chars_cow(text: Cow<str>) -> String {
     text.replace(bullet, " ")
 }
 
-pub fn parse_logseq_notebook(md: &str, title: &str, server_info: &ServerInformation) -> String {
+pub fn parse_logseq_notebook(md: Cow<'_,str>, title: &str, server_info: &ServerInformation) -> String {
     // Now we do some parsing for this file
     let content = exclude_advanced_query(md);
     let content = hack_specific_chars_cow(content);
@@ -50,4 +49,4 @@ fn hack_specific_chars(text: String) -> String {
     let bullet = char::from_u32(0x00002022).unwrap();
     // println!("{}", bullet);
     text.replace(bullet, " ")
-}
\ No newline at end of file
+}
diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs
index cb7c021..51a297e 100644
--- a/fire_seq_search_server/src/query_engine/mod.rs
+++ b/fire_seq_search_server/src/query_engine/mod.rs
@@ -1,12 +1,13 @@
 // Everything about Tantivy should be hidden behind this component
 
-use log::{debug, info, warn};
+use log::{debug, info, warn, error};
 
 use crate::{Article, decode_cjk_str};
 use crate::post_query::post_query_wrapper;
 
 use std::sync::Arc;
+use std::borrow::Cow;
 
 
 // This struct should be immutable when the program starts running
@@ -39,19 +40,35 @@ pub struct QueryEngine {
     pub server_info: ServerInformation,
     reader: tantivy::IndexReader,
     query_parser: tantivy::query::QueryParser,
-    articles: Vec<Article>, //TODO remove it. only word cloud needs it
+    //articles: Vec<Article>, //TODO remove it. only word cloud needs it
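+    // Article bodies are no longer cached on the engine; only the word cloud
+    // consumed them, and generate_wordcloud() is stubbed out below.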
     pub llm: Option<Arc<LlmEngine>>,
 }
 
+use tantivy::IndexWriter;
+use tantivy::TantivyDocument;
+
+use crate::load_notes::NoteListItem;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
+
+use tantivy::doc;
+
 impl QueryEngine {
-    pub fn construct(server_info: ServerInformation) -> Self {
+    pub async fn construct(server_info: ServerInformation) -> Self {
         let document_setting: DocumentSetting = build_document_setting();
+        let note_list = crate::load_notes::retrive_note_list(&server_info);
+        let index: tantivy::Index = QueryEngine::build_index(&server_info,
+            &document_setting,
+            note_list).await;
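+
+        // The previous synchronous load-and-index path is kept below,
+        // commented out, for reference.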
+        /*
         let loaded_notes = crate::load_notes::read_all_notes(&server_info);
         let loaded_articles: Vec<Article> = loaded_notes.into_iter().map(
             |x| Article{file_name:x.0, content:x.1}
         ).collect();
-        let index = indexing_documents(&server_info, &document_setting, &loaded_articles);
+        */
 
         let (reader, query_parser) = build_reader_parser(&index, &document_setting);
         debug!("Query engine construction finished");
@@ -60,10 +77,77 @@ impl QueryEngine {
             server_info,
             reader,
             query_parser,
-            articles: loaded_articles,
+            // articles: Vec::new(),
+            // articles: loaded_articles,
             llm: None,
         }
     }
+
+    // Read one note from disk, strip Logseq/Obsidian markup, and queue it on
+    // the shared IndexWriter; unreadable files are logged and skipped.
+    async fn load_single_note(
+        server_info: &ServerInformation,
+        document_setting: &DocumentSetting,
+        note: NoteListItem,
+        index_writer: &IndexWriter) {
+
+        info!("inside future {:?}", note);
+
+        let raw_content = match std::fs::read_to_string(&note.realpath) {
+            Ok(s) => s,
+            Err(e) => {
+                error!("Failed to read {:?}, err ({:?}), skipping", &note, &e);
+                return;
+            }
+        };
+
+        let content = crate::markdown_parser::parse_logseq_notebook(
+            Cow::from(raw_content), &note.title, server_info);
+
+        let schema = &document_setting.schema;
+        let title = schema.get_field("title").unwrap();
+        let body = schema.get_field("body").unwrap();
+        index_writer.add_document(
+            tantivy::doc!{
+                title => note.title,
+                body => content,
+            }
+        ).unwrap();
+    }
+
+    async fn load_all_notes(server_info: &ServerInformation,
+        document_setting: &DocumentSetting,
+        note_list: Vec<NoteListItem>,
+        index_writer: &IndexWriter) {
+
+        let mut futs: FuturesUnordered<_> = FuturesUnordered::new();
+        for article in note_list {
+            futs.push(
+                QueryEngine::load_single_note(
+                    server_info,
+                    document_setting,
+                    article,
+                    index_writer)
+            );
+        }
+        // Drive all per-note futures to completion.
+        while let Some(_result) = futs.next().await {}
+    }
+
+    async fn build_index(server_info: &ServerInformation,
+        document_setting: &DocumentSetting,
+        note_list: Vec<NoteListItem>) -> tantivy::Index {
+
+        let schema = &document_setting.schema;
+        let index = tantivy::Index::create_in_ram(schema.clone());
+
+        index.tokenizers().register(TOKENIZER_ID, document_setting.tokenizer.clone());
+        let mut index_writer = index.writer(50_000_000).unwrap();
+
+        QueryEngine::load_all_notes(&server_info,
+            &document_setting,
+            note_list,
+            &index_writer).await;
+
+        index_writer.commit().unwrap();
+        index
+    }
 }
 
 #[derive(Debug)]
@@ -95,11 +179,9 @@ impl DocData {
 }
 
 impl QueryEngine {
-
-
-
     pub fn generate_wordcloud(self: &Self) -> String {
-        crate::word_frequency::generate_wordcloud(&self.articles)
+        String::from("TODO: wordcloud is turned off")
+        //crate::word_frequency::generate_wordcloud(&self.articles)
     }
 
     pub async fn query_pipeline(self: &Self, term: String) -> String {
@@ -126,7 +208,6 @@ impl QueryEngine {
 
         let json = serde_json::to_string(&result).unwrap();
-        // info!("Search result {}", &json);
         json
     }
@@ -156,7 +237,6 @@ impl QueryEngine {
         };
         tokio::time::sleep(wait_llm).await;
     }
-        // llm.summarize(&title).await
 }
 pub async fn summarize(&self, title: String) -> String {
     info!("Called summarize on {}", &title);
@@ -198,6 +278,7 @@ fn build_reader_parser(index: &tantivy::Index, document_setting: &DocumentSettin
     (reader, query_parser)
 }
 
+/*
 fn indexing_documents(server_info: &ServerInformation,
                       document_setting: &DocumentSetting,
                       pages:&Vec<Article>) -> tantivy::Index {
@@ -225,12 +306,10 @@ fn indexing_documents(server_info: &ServerInformation,
                 body => article.content.clone()}
         ).unwrap();
     }
-
-
-
     index_writer.commit().unwrap();
     index
 }
+*/