From b5eeadc735610a4525f46fe77100fae9f72e3687 Mon Sep 17 00:00:00 2001 From: Zhenbo Li <3221521+Endle@users.noreply.github.com> Date: Tue, 16 Apr 2024 13:16:27 -0400 Subject: [PATCH] change definition of highlight_keywords_in_body (#135) --- fire_seq_search_server/debug_server.sh | 6 ++++-- .../src/post_query/highlighter.rs | 4 +++- .../src/post_query/hit_parsed.rs | 15 ++++----------- fire_seq_search_server/src/post_query/mod.rs | 2 +- .../tests/unit_test_post_query.rs | 14 ++++++++++---- 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/fire_seq_search_server/debug_server.sh b/fire_seq_search_server/debug_server.sh index e16611f..b72cb07 100644 --- a/fire_seq_search_server/debug_server.sh +++ b/fire_seq_search_server/debug_server.sh @@ -1,8 +1,10 @@ set -e rm -f ./fire_seq_search_server -nix-shell -p cargo -p rustc -p libiconv --run "cargo build" +# nix-shell -p cargo -p rustc -p libiconv --run "cargo build" +cargo build cp target/debug/fire_seq_search_server ./fire_seq_search_server RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \ ---notebook_path /Users/zhenboli/logseq \ +--notebook_path ~/logseq --exclude-zotero-items # --parse-pdf-links +--notebook_path /Users/zhenboli/logseq \ diff --git a/fire_seq_search_server/src/post_query/highlighter.rs b/fire_seq_search_server/src/post_query/highlighter.rs index daf0c3a..eef6a45 100644 --- a/fire_seq_search_server/src/post_query/highlighter.rs +++ b/fire_seq_search_server/src/post_query/highlighter.rs @@ -5,14 +5,16 @@ use regex::RegexBuilder; use lazy_static::lazy_static; use crate::post_query::highlighter::HighlightStatusWithWords::{Highlight, Lowlight}; +use crate::query_engine::ServerInformation; lazy_static! 
{ static ref STOPWORDS_LIST: HashSet<String> = crate::language_tools::generate_stopwords_list(); } pub fn highlight_keywords_in_body(body: &str, term_tokens: &Vec<String>, - show_summary_single_line_chars_limit: usize) -> String { + server_info: &ServerInformation) -> String { + let show_summary_single_line_chars_limit: usize = server_info.show_summary_single_line_chars_limit; let blocks = split_body_to_blocks(body, show_summary_single_line_chars_limit); let nltk = &STOPWORDS_LIST; diff --git a/fire_seq_search_server/src/post_query/hit_parsed.rs b/fire_seq_search_server/src/post_query/hit_parsed.rs index fb233f0..e3aa726 100644 --- a/fire_seq_search_server/src/post_query/hit_parsed.rs +++ b/fire_seq_search_server/src/post_query/hit_parsed.rs @@ -14,20 +14,15 @@ pub struct FireSeqSearchHitParsed { pub logseq_uri: String, } - - - impl FireSeqSearchHitParsed { pub fn from_tantivy(doc: &tantivy::schema::Document, score: f32, term_tokens: &Vec<String>, server_info: &ServerInformation) ->FireSeqSearchHitParsed { - for _field in doc.field_values() { - // debug!("field {:?} ", &field); - } + let title: &str = doc.field_values()[0].value().as_text().unwrap(); let body: &str = doc.field_values()[1].value().as_text().unwrap(); - let summary = highlight_keywords_in_body(body, term_tokens, server_info.show_summary_single_line_chars_limit); + let summary = highlight_keywords_in_body(body, term_tokens, server_info); let mut is_page_hit = true; let title = if title.starts_with(JOURNAL_PREFIX) { @@ -40,9 +35,7 @@ impl FireSeqSearchHitParsed { title.to_string() }; - - let logseq_uri = generate_uri(&title, &is_page_hit, &server_info); - + let logseq_uri = generate_uri(&title, &is_page_hit, server_info); debug!("Processing a hit, title={}, uri={}", &title, &logseq_uri); @@ -102,4 +95,4 @@ mod test_serde { // assert!(serde("Games/EU4").contains("\"logseq://graph/logseq_notebook?page=Games/EU4\"")); // // } -} \ No newline at end of file +} diff --git a/fire_seq_search_server/src/post_query/mod.rs 
b/fire_seq_search_server/src/post_query/mod.rs index cc7bbf2..3560055 100644 --- a/fire_seq_search_server/src/post_query/mod.rs +++ b/fire_seq_search_server/src/post_query/mod.rs @@ -16,7 +16,7 @@ pub fn post_query_wrapper(top_docs: Vec<(f32, tantivy::DocAddress)>, term: &str, searcher: &tantivy::LeasedItem<tantivy::Searcher>, server_info: &ServerInformation) -> Vec<String> { - let term_tokens = tokenize_default(&term); + let term_tokens = tokenize_default(term); info!("get term tokens {:?}", &term_tokens); let result: Vec<String> = top_docs.par_iter() .map(|x| parse_and_serde(x, searcher, &term_tokens, server_info)) diff --git a/fire_seq_search_server/tests/unit_test_post_query.rs b/fire_seq_search_server/tests/unit_test_post_query.rs index 4311791..607bbd2 100644 --- a/fire_seq_search_server/tests/unit_test_post_query.rs +++ b/fire_seq_search_server/tests/unit_test_post_query.rs @@ -1,16 +1,22 @@ use fire_seq_search_server::post_query::highlighter::{highlight_keywords_in_body, highlight_sentence_with_keywords, locate_single_keyword, split_body_to_blocks, wrap_text_at_given_spots}; +use fire_seq_search_server::generate_server_info_for_test; fn get_english_text() -> String { std::fs::read_to_string("tests/resource/pages/International Language, Past, Present & Future by Walter John Clark.md") .expect("Should have been able to read the file") } +fn highlight_keywords_in_body_old_2024_apr(body:&str, terms: &Vec<String>, limit:usize) ->String { + let mut server_info = generate_server_info_for_test(); + server_info.show_summary_single_line_chars_limit = limit; + highlight_keywords_in_body(body, terms, &server_info) +} #[test] fn test_empty_key() { let text = "Hello World"; let v = Vec::new(); - let r = highlight_keywords_in_body(text, &v, 120); + let r = highlight_keywords_in_body_old_2024_apr(text, &v, 120); assert_eq!(4,4); assert_eq!(&r, ""); @@ -22,7 +28,7 @@ fn test_empty_key() { fn test_highlight_wrap() { let contents = "使用 git shallow clone 下载并编译 Thunderbird".to_string(); let v = 
vec![String::from("thunderbird")]; - let r = highlight_keywords_in_body(&contents, &v, 120); + let r = highlight_keywords_in_body_old_2024_apr(&contents, &v, 120); assert_eq!(&r, "使用 git shallow clone 下载并编译 Thunderbird"); } @@ -30,7 +36,7 @@ fn test_highlight_wrap() { fn test_highlight_latex() { let contents = "$\\vec{q_i}^T \\vec{a_j}, i