diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index ec1c184..ca97421 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -10,17 +10,15 @@ jobs: cargo-deny: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Install Rust + - uses: actions/checkout@v4 + - name: Install Rust uses: actions-rs/toolchain@v1 with: toolchain: stable profile: minimal override: true - name: Run sccache-cache - uses: Xuanwo/sccache-action@c94e27bef21ab3fb4a5152c8a878c53262b4abb0 - with: - version: "v0.4.0-pre.6" + uses: mozilla-actions/sccache-action@v0.0.5 - name: Get Date id: get-date run: | diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ac7d6cd..9fdfbd9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -3,9 +3,9 @@ name: Linux and macOS # Template Reference: https://www.infinyon.com/blog/2021/04/github-actions-best-practices/ on: push: - branches: [ master ] + branches: [ master, llm_candidate ] pull_request: - branches: [ master ] + branches: [ master, llm_candidate ] env: CARGO_TERM_COLOR: always @@ -22,7 +22,9 @@ jobs: os: [ubuntu-latest, macos-13, macos-14] rust: [stable] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 + - name: Run sccache-cache + uses: mozilla-actions/sccache-action@v0.0.5 - name: Install Rust ${{ matrix.rust }} uses: actions-rs/toolchain@v1 with: @@ -30,9 +32,7 @@ jobs: profile: minimal override: true - name: Run sccache-cache - uses: mozilla-actions/sccache-action@c94e27bef21ab3fb4a5152c8a878c53262b4abb0 - with: - version: "v0.4.0-pre.6" + uses: mozilla-actions/sccache-action@v0.0.5 - name: Get Date id: get-date run: | @@ -74,7 +74,6 @@ jobs: - name: Run sccache stat for check shell: bash run: ${SCCACHE_PATH} --show-stats - release: @@ -89,9 +88,7 @@ jobs: profile: minimal override: true - name: Run sccache-cache - uses: Xuanwo/sccache-action@c94e27bef21ab3fb4a5152c8a878c53262b4abb0 - with: - version: "v0.4.0-pre.6" + uses: mozilla-actions/sccache-action@v0.0.5 - name: Get Date id: get-date run: | diff --git a/docs/examples.md b/docs/examples.md index c441d71..f71ff11 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1,3 +1,9 @@ +## LLM + +2024 Sept 22 + +https://github.com/user-attachments/assets/b0a4ca66-0a33-401a-a916-af7a69f2ae7b + ## ObsidianMD [obsidian_example_2023-Feb-05.mp4]( diff --git a/docs/release_notes_0.2_2024Sep.md b/docs/release_notes_0.2_2024Sep.md new file mode 100644 index 0000000..5a71bd5 --- /dev/null +++ b/docs/release_notes_0.2_2024Sep.md @@ -0,0 +1,29 @@ +### 0.2.1 + +New feature: Note Summarization with Local LLM. + +What happens locally, what stays locally. + +#### Run server with local LLM +fireSeqSearch facilitates [llamafile](https://github.com/Mozilla-Ocho/llamafile) by [Mozilla](https://github.com/Mozilla-Ocho). + +``` +mkdir -pv ~/.llamafile && cd ~/.llamafile +wget https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q4_0.llamafile?download=true +chmod +x mistral-7b-instruct-v0.2.Q4_0.llamafile +``` + +After that, compile and run fireSeqSearch with LLM +``` +cargo build --features llm +target/debug/fire_seq_search_server --notebook_path ~/logseq +# Obsidian users +target/debug/fire_seq_search_server --notebook_path ~/obsidian --obsidian-md +``` + +Finally, update the [Firefox Addon](https://addons.mozilla.org/en-US/firefox/addon/fireseqsearch/). 
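+
+Note on the download step above: depending on the wget version and flags, the file may be saved with the `?download=true` query string kept in its name, in which case the `chmod +x` line will not find it. A minimal sketch that pins the output filename (same URL and target directory as above):
+
+```
+cd ~/.llamafile
+wget -O mistral-7b-instruct-v0.2.Q4_0.llamafile \
+  "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q4_0.llamafile?download=true"
+chmod +x mistral-7b-instruct-v0.2.Q4_0.llamafile
+```
+
+The exact filename matters: the server resolves the model at `~/.llamafile/mistral-7b-instruct-v0.2.Q4_0.llamafile` (see `locate_llamafile` in `fire_seq_search_server/src/local_llm/mod.rs`), so it must match.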
+ +#### Demo Video +https://github.com/user-attachments/assets/b0a4ca66-0a33-401a-a916-af7a69f2ae7b + +This demo used [AstroWiki](https://github.com/AYelland/AstroWiki_2.0), which is licensed under MIT license. diff --git a/fireSeqSearch_addon/main.js b/fireSeqSearch_addon/main.js index 8b4c4a0..93df9f7 100644 --- a/fireSeqSearch_addon/main.js +++ b/fireSeqSearch_addon/main.js @@ -1,5 +1,5 @@ // MIT License -// Copyright (c) 2021-2023 Zhenbo Li +// Copyright (c) 2021-2024 Zhenbo Li const fireSeqSearchDomId = "fireSeqSearchDom"; @@ -128,79 +128,151 @@ function checkUserOptions() { ShowHighlight: res[2].ShowHighlight, ShowScore: res[3].ShowScore } - consoleLogForDebug(options); return options; }); } -async function appendResultToSearchResult(fetchResultArray, _container) { - const serverInfo = fetchResultArray[0]; - const rawSearchResult = fetchResultArray[1]; - const firefoxExtensionUserOption = await checkUserOptions(); +function parseRawList(rawSearchResult) { + const hits = []; + for (const rawRecord of rawSearchResult) { + const record = JSON.parse(rawRecord); + hits.push(record); + } + return hits; +} - consoleLogForDebug('Loaded user option: ' + JSON.stringify(firefoxExtensionUserOption)); +async function processLlmSummary(serverInfo, parsedSearchResult, fireDom) { + + const doneListApi = "http://127.0.0.1:3030/llm_done_list"; + let list = await fetch(doneListApi); + list = await list.text(); + list = JSON.parse(list); + + const findByTitle = function(title) { + const ul = fireDom.querySelector( ".fireSeqSearchHitList" ); + if (ul === null) return null; + for (const child of ul.children) { + const liTitle = child.firstChild.text; + if (title === liTitle) { + return child; + } + } + return null; + }; + const setLlmResult = function (title, llmSummary) { + const targetRow = findByTitle(title); + if (targetRow === null) { + consoleLogForDebug("Error! Can't find dom for ", title); + return; + } + if (targetRow.querySelector( ".fireSeqSearchLlmSummary" ) != null) { + consoleLogForDebug("Skip. 
We have the summary for ", title); + return; + } - function createTitleBarDom(count) { + const summary = createElementWithText("span", ""); + summary.innerHTML = llmSummary; + summary.classList.add('fireSeqSearchLlmSummary'); + targetRow.appendChild(summary); + }; + for (const record of parsedSearchResult) { + const title = record.title; + if (!list.includes(title)) { + consoleLogForDebug("Not ready, skip" + title); + continue; + } + // TODO remove hard code port + const llm_api = "http://127.0.0.1:3030/summarize/" + title; + let sum = await fetch(llm_api); + sum = await sum.text(); + setLlmResult(title, sum); + } +} + + +function createFireSeqDom(serverInfo, parsedSearchResult) { + const count = parsedSearchResult.length; + const div = document.createElement("div"); + div.setAttribute("id", fireSeqSearchDomId); + + const createTitleBarDom = function () { const titleBar = createElementWithText("div"); titleBar.classList.add('fireSeqSearchTitleBar'); const hitCount = `We found ${count.toString()} results in your logseq notebook`; titleBar.insertAdjacentHTML("afterbegin",hitCount); - const btn = document.createElement("button"); + + function setSummaryState(cl, state) { + let prop = 'none'; + if (state) { prop = ''; } + for (const el of document.querySelectorAll(cl)) { + el.style.display=prop; + } + } + let btn = document.createElement("button"); btn.classList.add("hideSummary"); - const text = document.createTextNode("Hide Summary (Tmp)"); + let text = document.createTextNode("Hide Summary"); btn.appendChild(text); btn.onclick = function () { - // alert("Button is clicked"); - for (const el of document.querySelectorAll('.fireSeqSearchHitSummary')) { - // el.style.visibility = 'hidden'; - el.remove(); - } + setSummaryState(".fireSeqSearchHitSummary", false); + setSummaryState(".fireSeqSearchLlmSummary", false); }; titleBar.appendChild(btn); - return titleBar; - } - function createFireSeqDom() { - const div = document.createElement("div"); - div.setAttribute("id", fireSeqSearchDomId); - return div; - } - - const dom = createFireSeqDom(); - dom.appendChild(createTitleBarDom(rawSearchResult.length)); - consoleLogForDebug(dom); - const hitList = document.createElement("ul"); + btn = document.createElement("button"); + btn.classList.add("showSummary"); + text = document.createTextNode("Summary"); + btn.appendChild(text); + btn.onclick = function () { + setSummaryState(".fireSeqSearchHitSummary", true); + setSummaryState(".fireSeqSearchLlmSummary", false); + }; + titleBar.appendChild(btn); - consoleLogForDebug(rawSearchResult); - for (const rawRecord of rawSearchResult) { - // const e = document.createTextNode(record); - consoleLogForDebug(rawRecord); - const record = JSON.parse(rawRecord); - consoleLogForDebug(typeof record); + btn = document.createElement("button"); + btn.classList.add("showLlm"); + text = document.createTextNode("LLM"); + btn.appendChild(text); + btn.onclick = function () { + setSummaryState(".fireSeqSearchHitSummary", false); + setSummaryState(".fireSeqSearchLlmSummary", true); + processLlmSummary(serverInfo, parsedSearchResult, div); + }; + titleBar.appendChild(btn); + return titleBar; + }; + const bar = createTitleBarDom(); + div.appendChild(bar); + return div; +} - const li = createElementWithText("li", ""); +async function appendResultToSearchResult(serverInfo, parsedSearchResult, dom) { + const firefoxExtensionUserOption = await checkUserOptions(); + consoleLogForDebug('Loaded user option: ' + JSON.stringify(firefoxExtensionUserOption)); + function 
buildListItems(parsedSearchResult) { + const hitList = document.createElement("ul"); + hitList.classList.add('fireSeqSearchHitList'); + for (const record of parsedSearchResult) { + const li = createElementWithText("li", ""); + li.classList.add('fireSeqSearchHitListItem'); + if (firefoxExtensionUserOption.ShowScore) { + const score = createElementWithText("span", String(record.score)); + li.appendChild(score); + } + const href = createHrefToLogseq(record, serverInfo); + li.appendChild(href); - if (firefoxExtensionUserOption.ShowScore) { - const score = createElementWithText("span", String(record.score)); - li.appendChild(score); - } - const href = createHrefToLogseq(record, serverInfo); - li.appendChild(href); - li.append(' ') - if (firefoxExtensionUserOption.ShowHighlight) { const summary = createElementWithText("span", ""); summary.innerHTML = record.summary; summary.classList.add('fireSeqSearchHitSummary'); li.appendChild(summary); - } - // let e = wrapRawRecordIntoElement(record, serverInfo); - // e.style. - hitList.appendChild(li); - // consoleLogForDebug("Added an element to the list"); + hitList.appendChild(li); + } + return hitList; } + const hitList = buildListItems(parsedSearchResult); dom.appendChild(hitList); if (firefoxExtensionUserOption.ExperimentalLayout) { @@ -228,6 +300,21 @@ async function appendResultToSearchResult(fetchResultArray, _container) { insertDivToWebpage(dom); } +async function mainProcess(fetchResultArray) { + consoleLogForDebug("main process"); + + const serverInfo = fetchResultArray[0]; + const rawSearchResult = fetchResultArray[1]; + consoleLogForDebug(serverInfo); + const parsedSearchResult = parseRawList(rawSearchResult); + + const fireDom = createFireSeqDom(serverInfo, parsedSearchResult); + + appendResultToSearchResult(serverInfo, parsedSearchResult, fireDom); + +} + + function getSearchParameterFromCurrentPage() { let searchParam = ""; @@ -259,7 +346,6 @@ function getSearchParameterFromCurrentPage() { (function() { const searchParameter = getSearchParameterFromCurrentPage(); - addGlobalStyle(fireSeqSearchScriptCSS); //https://gomakethings.com/waiting-for-multiple-all-api-responses-to-complete-with-the-vanilla-js-promise.all-method/ @@ -269,8 +355,7 @@ function getSearchParameterFromCurrentPage() { ]).then(function (responses) { return Promise.all(responses.map(function (response) {return response.json();})); }).then(function (data) { - consoleLogForDebug(data); - return appendResultToSearchResult(data); + mainProcess(data); }).then((_e) => { const highlightedItems = document.querySelectorAll('.fireSeqSearchHighlight'); consoleLogForDebug(highlightedItems); diff --git a/fireSeqSearch_addon/manifest.json b/fireSeqSearch_addon/manifest.json index d5b54d5..40bd71d 100644 --- a/fireSeqSearch_addon/manifest.json +++ b/fireSeqSearch_addon/manifest.json @@ -1,7 +1,7 @@ { "manifest_version": 2, "name": "fireSeqSearch", - "version": "0.1.4", + "version": "0.2.2", "description": "Everytime you use the search engine, this plugin will search against your personal logseq notes.", diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml index 8e3b243..9dd0367 100644 --- a/fire_seq_search_server/Cargo.toml +++ b/fire_seq_search_server/Cargo.toml @@ -1,16 +1,20 @@ [package] name = "fire_seq_search_server" -version = "0.1.3" +version = "0.2.1" edition = "2021" license = "MIT" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + [dependencies] -# Http Client + tokio = { version = "1", features = 
["full"] } -warp = "0.3" + +# Http Client +axum = "0.7.5" serde_json = "1.0" + # Serde # https://serde.rs/derive.html # https://stackoverflow.com/a/49313680/1166518 @@ -18,19 +22,21 @@ serde = { version = "1.0", features = ["derive", "rc"] } url = "2.3.1" # QueryEngine -tantivy = "0.18" +tantivy = "0.22" +tantivy-tokenizer-api = "0.3.0" +jieba-rs = { version = "0.7.0" } -log = "0.4.0" -env_logger = "0.9.0" +log = "0.4.22" +env_logger = "0.11.5" # Rust clap = { version = "4.0", features = ["derive"] } lazy_static = "1.4.0" rayon = "1.5" +futures = "0.3" urlencoding = "2.1.0" -jieba-rs = { version = "0.6.6" } # Language Processing @@ -39,7 +45,7 @@ stop-words = "0.7.2" regex = "1" lingua = { version = "1.4.0", default-features = false, features = ["chinese", "english"] } - +shellexpand = "3.1" #Highlight (Output) html-escape = "0.2.13" @@ -51,4 +57,19 @@ pulldown-cmark = { version = "0.9.2", default-features = false } #4: pdf_extract::show_text #at C:\Users\z2369li\.cargo\git\checkouts\pdf-extract-c67a6fa67c2d526c\0d8b9d9\src\lib.rs:1262:16 #pdf-extract = "0.6.4" -pdf-extract-temporary-mitigation-panic = "0.7.1" \ No newline at end of file +pdf-extract-temporary-mitigation-panic = "0.7.1" + + + +# TODO Currently turn them off will make cargo build fail +# I should make these deps optional, so those who doesn't want LLM could have a smaller binary +sha256 = { version = "1.5.0", optional = true } +reqwest = { version = "0.12", features = ["json"], optional = false } +serde_derive = { version = "1.0.209", optional = false} + +[features] +#default = ["llm"] +llm = ["sha256", + #"serde_derive", + #"request" +] diff --git a/fire_seq_search_server/debug_server.sh b/fire_seq_search_server/debug_server.sh index a5e7aad..ca8b8f8 100644 --- a/fire_seq_search_server/debug_server.sh +++ b/fire_seq_search_server/debug_server.sh @@ -1,8 +1,10 @@ set -e -rm -f ./fire_seq_search_server +rm -f ./fire_seq_search_server # nix-shell -p cargo -p rustc -p libiconv --run "cargo build" cargo build cp target/debug/fire_seq_search_server ./fire_seq_search_server -RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \ ---notebook_path ~/logseq --enable-journal-query +export RUST_LOG="warn,fire_seq_search_server=info" +#export RUST_LOG="debug" +export RUST_BACKTRACE=1 +./fire_seq_search_server --notebook_path ~/logseq --enable-journal-query diff --git a/fire_seq_search_server/debug_server_mac.sh b/fire_seq_search_server/debug_server_mac.sh new file mode 100644 index 0000000..6438843 --- /dev/null +++ b/fire_seq_search_server/debug_server_mac.sh @@ -0,0 +1,11 @@ +set -e +rm -f ./fire_seq_search_server +#nix-shell -p cargo -p rustc -p libiconv --run "cargo build" +cargo build --features llm +cp target/debug/fire_seq_search_server ./fire_seq_search_server + +export RUST_LOG="warn,fire_seq_search_server=info" +#export RUST_LOG="debug" +export RUST_BACKTRACE=1 +./fire_seq_search_server --notebook_path ~/logseq +#--enable-journal-query diff --git a/fire_seq_search_server/deny.toml b/fire_seq_search_server/deny.toml index eafa3bf..944f728 100644 --- a/fire_seq_search_server/deny.toml +++ b/fire_seq_search_server/deny.toml @@ -1,129 +1,30 @@ -# This template contains all of the possible sections and their default values - -# Note that all fields that take a lint level have these possible values: -# * deny - An error will be produced and the check will fail -# * warn - A warning will be produced, but the check will not fail -# * allow - No warning or error will be produced, though in some cases a note -# will be - -# The 
values provided in this template are the default values that will be used -# when any section or field is not specified in your own configuration - -# Root options - -# If 1 or more target triples (and optionally, target_features) are specified, -# only the specified targets will be checked when running `cargo deny check`. -# This means, if a particular package is only ever used as a target specific -# dependency, such as, for example, the `nix` crate only being used via the -# `target_family = "unix"` configuration, that only having windows targets in -# this list would mean the nix crate, as well as any of its exclusive -# dependencies not shared by any other crates, would be ignored, as the target -# list here is effectively saying which targets you are building for. +[graph] targets = [ - # The triple can be any string, but only the target triples built in to - # rustc (as of 1.40) can be checked against actual config expressions - #{ triple = "x86_64-unknown-linux-musl" }, - # You can also specify which target_features you promise are enabled for a - # particular target. target_features are currently not validated against - # the actual valid features supported by the target architecture. - #{ triple = "wasm32-unknown-unknown", features = ["atomics"] }, ] -# When creating the dependency graph used as the source of truth when checks are -# executed, this field can be used to prune crates from the graph, removing them -# from the view of cargo-deny. This is an extremely heavy hammer, as if a crate -# is pruned from the graph, all of its dependencies will also be pruned unless -# they are connected to another crate in the graph that hasn't been pruned, -# so it should be used with care. The identifiers are [Package ID Specifications] -# (https://doc.rust-lang.org/cargo/reference/pkgid-spec.html) -#exclude = [] -# If true, metadata will be collected with `--all-features`. Note that this can't -# be toggled off if true, if you want to conditionally enable `--all-features` it -# is recommended to pass `--all-features` on the cmd line instead all-features = false -# If true, metadata will be collected with `--no-default-features`. The same -# caveat with `all-features` applies no-default-features = false -# If set, these feature will be enabled when collecting metadata. If `--features` -# is specified on the cmd line they will take precedence over this option. -#features = [] -# When outputting inclusion graphs in diagnostics that include features, this -# option can be used to specify the depth at which feature edges will be added. -# This option is included since the graphs can be quite large and the addition -# of features from the crate(s) to all of the graph roots can be far too verbose. -# This option can be overridden via `--feature-depth` on the cmd line + +[output] feature-depth = 1 -# This section is considered when running `cargo deny check advisories` -# More documentation for the advisories section can be found here: -# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html [advisories] -# The path where the advisory database is cloned/fetched into -db-path = "~/.cargo/advisory-db" -# The url(s) of the advisory databases to use -db-urls = ["https://github.com/rustsec/advisory-db"] -# The lint level for security vulnerabilities -vulnerability = "deny" -# The lint level for unmaintained crates -unmaintained = "warn" -# The lint level for crates that have been yanked from their source registry -yanked = "warn" -# The lint level for crates with security notices. 
Note that as of -# 2019-12-17 there are no security notice advisories in -# https://github.com/rustsec/advisory-db -notice = "warn" -# A list of advisory IDs to ignore. Note that ignored advisories will still -# output a note when they are encountered. +# Not finished ignore = [ - #"RUSTSEC-0000-0000", + { id = "RUSTSEC-2020-0056", reason = "pdf extract" }, + { id = "RUSTSEC-2021-0153", reason = "pdf" }, ] -# Threshold for security vulnerabilities, any vulnerability with a CVSS score -# lower than the range specified will be ignored. Note that ignored advisories -# will still output a note when they are encountered. -# * None - CVSS Score 0.0 -# * Low - CVSS Score 0.1 - 3.9 -# * Medium - CVSS Score 4.0 - 6.9 -# * High - CVSS Score 7.0 - 8.9 -# * Critical - CVSS Score 9.0 - 10.0 -#severity-threshold = -# If this is true, then cargo deny will use the git executable to fetch advisory database. -# If this is false, then it uses a built-in git library. -# Setting this to true can be helpful if you have special authentication requirements that cargo-deny does not support. -# See Git Authentication for more information about setting up git authentication. -#git-fetch-with-cli = true -# This section is considered when running `cargo deny check licenses` -# More documentation for the licenses section can be found here: -# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] -# The lint level for crates which do not have a detectable license -unlicensed = "warn" # List of explicitly allowed licenses # See https://spdx.org/licenses/ for list of possible licenses # [possible values: any SPDX 3.11 short identifier (+ optional exception)]. allow = [ + "MIT", "Apache-2.0", + "BSD-2-Clause", "BSD-3-Clause", "CC0-1.0", + "MPL-2.0", ] -# List of explicitly disallowed licenses -# See https://spdx.org/licenses/ for list of possible licenses -# [possible values: any SPDX 3.11 short identifier (+ optional exception)]. -deny = [ - #"Nokia", -] -# Lint level for licenses considered copyleft -copyleft = "warn" -# Blanket approval or denial for OSI-approved or FSF Free/Libre licenses -# * both - The license will be approved if it is both OSI-approved *AND* FSF -# * either - The license will be approved if it is either OSI-approved *OR* FSF -# * osi-only - The license will be approved if is OSI-approved *AND NOT* FSF -# * fsf-only - The license will be approved if is FSF *AND NOT* OSI-approved -# * neither - This predicate is ignored and the default lint level is used -allow-osi-fsf-free = "both" -# Lint level used when no other predicates are matched -# 1. License isn't in the allow or deny lists -# 2. License isn't copyleft -# 3. License isn't OSI/FSF, or allow-osi-fsf-free = "neither" -default = "deny" # The confidence threshold for detecting a license from license text. # The higher the value, the more closely the license text must be to the # canonical license text of a valid SPDX license file. 
@@ -132,45 +33,10 @@ confidence-threshold = 0.8 # Allow 1 or more licenses on a per-crate basis, so that particular licenses # aren't accepted for every possible crate as with the normal allow list exceptions = [ - # Each entry is the crate and version constraint, and its specific allow - # list { name = "fastdivide", allow = ["zlib-acknowledgement"] }, { name = "unicode-ident", allow = ["Unicode-DFS-2016"] }, ] -# Some crates don't have (easily) machine readable licensing information, -# adding a clarification entry for it allows you to manually specify the -# licensing information -#[[licenses.clarify]] -# The name of the crate the clarification applies to -#name = "ring" -# The optional version constraint for the crate -#version = "*" -# The SPDX expression for the license requirements of the crate -#expression = "MIT AND ISC AND OpenSSL" -# One or more files in the crate's source used as the "source of truth" for -# the license expression. If the contents match, the clarification will be used -# when running the license check, otherwise the clarification will be ignored -# and the crate will be checked normally, which may produce warnings or errors -# depending on the rest of your configuration -#license-files = [ - # Each entry is a crate relative path, and the (opaque) hash of its contents - #{ path = "LICENSE", hash = 0xbd0eed23 } -#] - -[licenses.private] -# If true, ignores workspace crates that aren't published, or are only -# published to private registries. -# To see how to mark a crate as unpublished (to the official registry), -# visit https://doc.rust-lang.org/cargo/reference/manifest.html#the-publish-field. -ignore = false -# One or more private registries that you might publish crates to, if a crate -# is only published to private registries, and ignore is true, the crate will -# not have its license(s) checked -registries = [ - #"https://sekretz.com/registry -] - # This section is considered when running `cargo deny check bans`. # More documentation about the 'bans' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html @@ -186,33 +52,32 @@ wildcards = "allow" # * all - Both lowest-version and simplest-path are used highlight = "all" # The default lint level for `default` features for crates that are members of -# the workspace that is being checked. This can be overriden by allowing/denying +# the workspace that is being checked. This can be overridden by allowing/denying # `default` on a crate-by-crate basis if desired. workspace-default-features = "allow" # The default lint level for `default` features for external crates that are not -# members of the workspace. This can be overriden by allowing/denying `default` +# members of the workspace. This can be overridden by allowing/denying `default` # on a crate-by-crate basis if desired. external-default-features = "allow" # List of crates that are allowed. Use with care! allow = [ - #{ name = "ansi_term", version = "=0.11.0" }, + #"ansi_term@0.11.0", + #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is allowed" }, ] # List of crates to deny deny = [ - # Each entry the name of a crate and a version range. If version is - # not specified, all versions will be matched. 
- #{ name = "ansi_term", version = "=0.11.0" }, - # + #"ansi_term@0.11.0", + #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is banned" }, # Wrapper crates can optionally be specified to allow the crate when it # is a direct dependency of the otherwise banned crate - #{ name = "ansi_term", version = "=0.11.0", wrappers = [] }, + #{ crate = "ansi_term@0.11.0", wrappers = ["this-crate-directly-depends-on-ansi_term"] }, ] # List of features to allow/deny # Each entry the name of a crate and a version range. If version is # not specified, all versions will be matched. #[[bans.features]] -#name = "reqwest" +#crate = "reqwest" # Features to not allow #deny = ["json"] # Features to allow @@ -233,14 +98,16 @@ deny = [ # Certain crates/versions that will be skipped when doing duplicate detection. skip = [ - #{ name = "ansi_term", version = "=0.11.0" }, + #"ansi_term@0.11.0", + #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason why it can't be updated/removed" }, ] # Similarly to `skip` allows you to skip certain crates during duplicate # detection. Unlike skip, it also includes the entire tree of transitive # dependencies starting at the specified crate, up to a certain depth, which is # by default infinite. skip-tree = [ - #{ name = "ansi_term", version = "=0.11.0", depth = 20 }, + #"ansi_term@0.11.0", # will be skipped along with _all_ of its direct and transitive dependencies + #{ crate = "ansi_term@0.11.0", depth = 20 }, ] # This section is considered when running `cargo deny check sources`. @@ -259,10 +126,3 @@ allow-registry = ["https://github.com/rust-lang/crates.io-index"] # List of URLs for allowed Git repositories allow-git = [] -[sources.allow-org] -# 1 or more github.com organizations to allow git sources for -github = [""] -# 1 or more gitlab.com organizations to allow git sources for -gitlab = [""] -# 1 or more bitbucket.org organizations to allow git sources for -bitbucket = [""] diff --git a/fire_seq_search_server/obsidian.sh b/fire_seq_search_server/obsidian.sh old mode 100644 new mode 100755 index 61e9d91..823317e --- a/fire_seq_search_server/obsidian.sh +++ b/fire_seq_search_server/obsidian.sh @@ -1,8 +1,10 @@ set -e -cargo build +cargo build --features llm rm ./fire_seq_search_server -f -cp --force target/debug/fire_seq_search_server.exe ./fire_seq_search_server +cp --force target/debug/fire_seq_search_server ./fire_seq_search_server + +NOTEBOOK_NAME=AstroWiki_2.0-main RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \ ---notebook_path /c/Users/z2369li/Documents/graph-note-of-greek-myth/希腊神话 \ ---obsidian-md + --notebook_path ~/Documents/$NOTEBOOK_NAME \ + --obsidian-md diff --git a/fire_seq_search_server/src/http_client/endpoints.rs b/fire_seq_search_server/src/http_client/endpoints.rs index 2d03cbf..0f40899 100644 --- a/fire_seq_search_server/src/http_client/endpoints.rs +++ b/fire_seq_search_server/src/http_client/endpoints.rs @@ -1,23 +1,48 @@ use std::sync::Arc; -use log::debug; -use crate::query_engine::QueryEngine; -use serde_json; +use log::{debug}; -pub fn get_server_info(engine_arc: Arc) -> String { - serde_json::to_string( &engine_arc.server_info ).unwrap() +use crate::query_engine::{QueryEngine, ServerInformation}; +use axum::Json; +use axum::extract::State; +use axum::{response::Html, extract::Path}; + +pub async fn get_server_info(State(engine_arc): State>) + -> Json { + axum::Json( engine_arc.server_info.to_owned() ) } -pub fn query(term: String, engine_arc: Arc) - -> String { + +pub async fn query( + Path(term) : Path, 
+ State(engine_arc): State> + ) -> Html{ debug!("Original Search term {}", term); - engine_arc.query_pipeline(term) + let r = engine_arc.query_pipeline(term); + Html(r.await) } +pub async fn summarize( + Path(title) : Path, + State(engine_arc): State> + ) -> Html{ + + let r = engine_arc.summarize(title); + Html(r.await) +} -pub fn generate_word_cloud(engine_arc: Arc) -> String { +pub async fn get_llm_done_list( + State(engine_arc): State> + ) -> Html{ + let r = engine_arc.get_llm_done_list(); + Html(r.await) +} + +pub async fn generate_word_cloud(State(engine_arc): State>) + -> Html { let div_id = "fireSeqSearchWordcloudRawJson"; let json = engine_arc.generate_wordcloud(); let div = format!("
<div id='{}'>{}</div>
", div_id, json); - div + Html(div) } + diff --git a/fire_seq_search_server/src/language_tools/tokenizer.rs b/fire_seq_search_server/src/language_tools/tokenizer.rs index 31377a2..7cac6d9 100644 --- a/fire_seq_search_server/src/language_tools/tokenizer.rs +++ b/fire_seq_search_server/src/language_tools/tokenizer.rs @@ -21,12 +21,16 @@ pub fn filter_out_stopwords<'a,'b>(term_tokens: &'a [String], nltk: &'b HashSet< pub fn tokenize(sentence: &str) -> Vec { + /* lazy_static! { static ref TK: crate::JiebaTokenizer = crate::JiebaTokenizer {}; } + */ if crate::language_tools::is_chinese(sentence) { info!("Use Tokenizer for Chinese term {}", sentence); - crate::tokenize_sentence_to_text_vec(&TK, sentence) + let mut jieba = FireSeqTokenizer {}; + //TODO don't create a tokenizer every time + crate::tokenize_sentence_to_text_vec(&mut jieba, sentence) } else { // info!("Space Tokenizer {}", sentence); let result : Vec<&str> = sentence.split_whitespace() @@ -36,4 +40,77 @@ pub fn tokenize(sentence: &str) -> Vec { result // vec![String::from(sentence)] } -} \ No newline at end of file +} + +use lazy_static::lazy_static; +use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer}; + +lazy_static! { + static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new(); +} + +pub const TOKENIZER_ID: &str = "fireseq_tokenizer"; + +#[derive(Clone)] +pub struct FireSeqTokenizer; + + + +pub struct JiebaTokenStream { + tokens: Vec, + index: usize, +} + +impl TokenStream for JiebaTokenStream { + fn advance(&mut self) -> bool { + if self.index < self.tokens.len() { + self.index = self.index + 1; + true + } else { + false + } + } + fn token(&self) -> &Token { + &self.tokens[self.index - 1] + } + fn token_mut(&mut self) -> &mut Token { + &mut self.tokens[self.index - 1] + } +} + +impl Tokenizer for FireSeqTokenizer { + type TokenStream<'a> = JiebaTokenStream; + fn token_stream<'a>(&mut self, text: &'a str) -> JiebaTokenStream { + let mut indices = text.char_indices().collect::>(); + indices.push((text.len(), '\0')); + let orig_tokens = JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, true); + let mut tokens = Vec::new(); + // copy tantivy-jieba code for now + for token in orig_tokens { + tokens.push(Token { + offset_from: indices[token.start].0, + offset_to: indices[token.end].0, + position: token.start, + text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), + position_length: token.end - token.start, + }); + } + /* + for i in 0..orig_tokens.len() { + let token = &orig_tokens[i]; + match process_token_text(text, &indices, &token) { + Some(text) => tokens.push(Token { + offset_from: indices[token.start].0, + offset_to: indices[token.end].0, + position: token.start, + text, + position_length: token.end - token.start, + }), + None => () + } + + } + */ + JiebaTokenStream { tokens, index: 0 } + } +} diff --git a/fire_seq_search_server/src/lib.rs b/fire_seq_search_server/src/lib.rs index c241ddf..bc397b3 100644 --- a/fire_seq_search_server/src/lib.rs +++ b/fire_seq_search_server/src/lib.rs @@ -5,10 +5,12 @@ pub mod language_tools; pub mod http_client; pub mod query_engine; pub mod word_frequency; +pub mod local_llm; -use log::{debug, info}; +use log::debug; use crate::query_engine::ServerInformation; +use crate::query_engine::NotebookSoftware::Logseq; #[macro_use] @@ -18,6 +20,7 @@ pub static JOURNAL_PREFIX: &str = "@journal@"; pub struct Article { + #[allow(dead_code)] /* TODO rethink if we need it 2024 Sep 21 */ file_name: String, content: String } @@ -25,47 +28,23 @@ pub struct Article { // Based 
on https://github.com/jiegec/tantivy-jieba // tantivy-jieba is licensed under MIT, Copyright 2019-2020 Jiajie Chen // I had heavy modifications on it +/* lazy_static! { static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new(); } +*/ -pub const TOKENIZER_ID: &str = "fss_tokenizer"; - -use tantivy::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer}; - -pub struct JiebaTokenStream { - tokens: Vec, - index: usize, -} - - -#[derive(Clone)] -pub struct JiebaTokenizer; - -impl TokenStream for JiebaTokenStream { - fn advance(&mut self) -> bool { - if self.index < self.tokens.len() { - self.index = self.index + 1; - true - } else { - false - } - } - - fn token(&self) -> &Token { - &self.tokens[self.index - 1] - } +//pub const TOKENIZER_ID: &str = "fss_tokenizer"; - fn token_mut(&mut self) -> &mut Token { - &mut self.tokens[self.index - 1] - } -} +/* impl Tokenizer for JiebaTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + type TokenStream<'a> = JiebaTokenStream; + fn token_stream<'a>(&mut self, text: &'a str) -> JiebaTokenStream { let mut indices = text.char_indices().collect::>(); indices.push((text.len(), '\0')); - let orig_tokens = JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, true); + let jieba : jieba_rs::Jieba = jieba_rs::Jieba::new(); //TODO use a static one + let orig_tokens = jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true); let mut tokens = Vec::new(); for i in 0..orig_tokens.len() { let token = &orig_tokens[i]; @@ -81,9 +60,11 @@ impl Tokenizer for JiebaTokenizer { } } - BoxTokenStream::from(JiebaTokenStream { tokens, index: 0 }) + JiebaTokenStream { tokens, index: 0 } + } } +*/ /* Thoughts on lowercase 2022-07-04: @@ -93,7 +74,6 @@ tanvity's default tokenizer will lowercase all English characters. However, I think there could be a better approach 1. use https://github.com/pemistahl/lingua-rs to determine the language of the text 2. Select proper tokenizer - */ fn process_token_text(text: &str, indices: &Vec<(usize, char)>, token: &jieba_rs::Token<'_>) -> Option { let raw = String::from(&text[(indices[token.start].0)..(indices[token.end].0)]); let lower = raw.to_lowercase(); @@ -103,15 +83,27 @@ fn process_token_text(text: &str, indices: &Vec<(usize, char)>, token: &jieba_rs Some(lower) } } + */ +// TODO use stub now +pub fn tokenize_default(sentence: &str) -> Vec { + let mut r = Vec::new(); + r.push(sentence.to_owned()); + r +} +/* // TODO: Move tokenizer-related things into language_tools pub fn tokenize_default(sentence: &str) -> Vec { + /* lazy_static! 
{ static ref TK: JiebaTokenizer = crate::JiebaTokenizer {}; } + */ + // TODO use static tokenizer + let mut tokenizer = crate::JiebaTokenizer{}; if language_tools::is_chinese(sentence) { info!("Use Tokenizer for Chinese term {}", sentence); - tokenize_sentence_to_text_vec(&TK, sentence) + tokenize_sentence_to_text_vec(&mut tokenizer, sentence) } else { // info!("Space Tokenizer {}", sentence); let result : Vec<&str> = sentence.split_whitespace() @@ -122,13 +114,15 @@ pub fn tokenize_default(sentence: &str) -> Vec { // vec![String::from(sentence)] } } +*/ -pub fn tokenize_sentence_to_text_vec(tokenizer: &JiebaTokenizer, sentence: &str) -> Vec { - let tokens = tokenize_sentence_to_vector(&tokenizer, sentence); +use crate::language_tools::tokenizer::FireSeqTokenizer; +pub fn tokenize_sentence_to_text_vec(tokenizer: &mut FireSeqTokenizer, sentence: &str) -> Vec { + let tokens = tokenize_sentence_to_vector(tokenizer, sentence); tokens_to_text_vec(&tokens) } -pub fn tokenize_sentence_to_vector(tokenizer: &JiebaTokenizer, sentence: &str) -> Vec { +pub fn tokenize_sentence_to_vector(tokenizer: &mut FireSeqTokenizer, sentence: &str) -> Vec { use tantivy::tokenizer::*; let mut token_stream = tokenizer.token_stream( sentence @@ -176,12 +170,15 @@ pub fn generate_server_info_for_test() -> ServerInformation { show_summary_single_line_chars_limit: 0, parse_pdf_links: false, exclude_zotero_items: false, - obsidian_md: false, - convert_underline_hierarchy: true + software: Logseq, + convert_underline_hierarchy: true, + host: "127.0.0.1:22024".to_string(), + llm_enabled: false, }; server_info } +/* #[cfg(test)] mod test_tokenizer { #[test] @@ -249,3 +246,5 @@ mod test_tokenizer { } +*/ + diff --git a/fire_seq_search_server/src/load_notes/mod.rs b/fire_seq_search_server/src/load_notes/mod.rs index 2c2d027..9ac794c 100644 --- a/fire_seq_search_server/src/load_notes/mod.rs +++ b/fire_seq_search_server/src/load_notes/mod.rs @@ -1,139 +1,102 @@ -use std::fs::DirEntry; -use log::{debug, error, info, warn}; +use log::{debug, error, info}; use std::process; -use rayon::prelude::*; use crate::query_engine::ServerInformation; -use crate::JOURNAL_PREFIX; -pub fn read_all_notes(server_info: &ServerInformation) -> Vec<(String, String)> { - // I should remove the unwrap and convert it into map - let path: &str = &server_info.notebook_path; - let path = path.to_owned(); - let pages_path = if server_info.obsidian_md { - path.clone() - } else{ - path.clone() + "/pages" - }; - - - let mut pages: Vec<(String, String)> = Vec:: new(); +use std::borrow::Cow; +use std::borrow::Borrow; - let pages_tmp: Vec<(String, String)> = read_specific_directory(&pages_path).par_iter() - .map(|(title,md)| { - let content = crate::markdown_parser::parse_logseq_notebook(md, title, server_info); - (title.to_string(), content) - }).collect(); //silly collect. 
+#[derive(Debug, Clone)] +pub struct NoteListItem { + pub realpath: String, + pub title: String, +} - // TODO: Silly filter - for (file_name, contents) in pages_tmp { - // info!("File Name: {}", &file_name); - if server_info.exclude_zotero_items && file_name.starts_with('@') { - continue; - } - pages.push((file_name,contents)); - } - if server_info.enable_journal_query { - info!("Loading journals"); - let journals_page = path.clone() + "/journals"; - let journals:Vec<(String, String)> - = read_specific_directory(&journals_page).par_iter() - .map(|(title,md)| { - let content = crate::markdown_parser::parse_logseq_notebook(md, title, server_info); - let tantivy_title = JOURNAL_PREFIX.to_owned() + &title; - (tantivy_title, content) - }).collect(); //silly collect. - - - for (file_name, contents) in journals { - pages.push((file_name,contents)); - } +use crate::query_engine::NotebookSoftware; +pub fn retrive_note_list(server_info: &ServerInformation) -> Vec { + let path: &str = &server_info.notebook_path; - } + let note_list = match &server_info.software { + NotebookSoftware::Obsidian => list_directory( Cow::from(path) , true), + NotebookSoftware::Logseq => { + let pp = path.to_string() + "/pages"; + let mut pages = list_directory( Cow::from(pp), false ); - pages + // TODO Journal prefix + let pp = path.to_string() + "/journals"; + let jours = list_directory( Cow::from(pp), false ); + pages.extend(jours); + pages + }, + }; + // TODO didn't handle logseq + note_list } -pub fn read_specific_directory(path: &str) -> Vec<(String, String)> { - info!("Try to read {}", &path); - let notebooks = match std::fs::read_dir(path) { +fn list_directory(path: Cow<'_, str>, recursive: bool) -> Vec { + debug!("Listing directory {}", &path); + let mut result = Vec::new(); + + let path_ref: &str = path.borrow(); + let notebooks = match std::fs::read_dir(path_ref) { Ok(x) => x, Err(e) => { - error!("Fatal error ({:?}) when reading {}", e, path); + error!("Fatal error ({:?}) when reading {}", e, &path); process::abort(); } }; - let mut note_filenames: Vec = Vec::new(); - for note in notebooks { - let note : DirEntry = note.unwrap(); - note_filenames.push(note); - } - // debug!("Note titles: {:?}", ¬e_filenames); - let result: Vec<(String,String)> = note_filenames.par_iter() - .map(|note| read_md_file_wo_parse(¬e)) - .filter(|x| (&x).is_some()) - .map(|x| x.unwrap()) - .collect(); - info!("Loaded {} notes from {}", result.len(), path); - // info!("After map {:?}", &result); - - result -} - + for note_result in notebooks { + let entry = match note_result { + Ok(x) => x, + Err(e) => { + error!("Error during looping {:?}", &e); + continue; + } + }; + let file_type = match entry.file_type() { + Ok(x) => x, + Err(e) => { + error!("Error: Can't get file type {:?} {:?}", &entry, &e); + continue; + } + }; + let entry_path = entry.path(); + let entry_path_str = entry_path.to_string_lossy(); -/// -/// -/// # Arguments -/// -/// * `note`: -/// -/// returns: Option<(String, String)> -/// -/// First: title (filename) -/// Second: full raw text -/// -/// I would delay the parsing job, so it could be couples with server info. -Zhenbo Li 2023-02-17 -/// If input is a directory or DS_STORE, return None -/// -pub fn read_md_file_wo_parse(note: &std::fs::DirEntry) -> Option<(String, String)> { - if let Ok(file_type) = note.file_type() { - // Now let's show our entry's file type! 
- debug!("{:?}: {:?}", note.path(), file_type); if file_type.is_dir() { - debug!("{:?} is a directory, skipping", note.path()); - return None; + if recursive { + let next = list_directory(entry_path_str, true); + result.extend(next); + } + continue; } - } else { - warn!("Couldn't get file type for {:?}", note.path()); - return None; - } - let note_path = note.path(); - let note_title = match note_path.file_stem() { - Some(osstr) => osstr.to_str().unwrap(), - None => { - error!("Couldn't get file_stem for {:?}", note.path()); - return None; + if !entry_path_str.ends_with(".md") { + info!("skip non-md file {:?}", &entry); + continue; } - }; - debug!("note title: {}", ¬e_title); - let content : String = match std::fs::read_to_string(¬e_path) { - Ok(c) => c, - Err(e) => { - if note_title.to_lowercase() == ".ds_store" { - debug!("Ignore .DS_Store for mac"); - } else { - error!("Error({:?}) when reading the file {:?}", e, note_path); + let note_title = match entry_path.file_stem() { + Some(osstr) => osstr.to_str().unwrap(), + None => { + error!("Couldn't get file_stem for {:?}", entry_path); + continue; } - return None; - } - }; - - Some((note_title.to_string(),content)) + }; + let row = NoteListItem { + realpath: entry_path_str.to_string(), + title: note_title.to_string(), + }; + result.push(row); + } + return result; } + + + + diff --git a/fire_seq_search_server/src/local_llm/example_llama_response.json b/fire_seq_search_server/src/local_llm/example_llama_response.json new file mode 100644 index 0000000..d18cb0e --- /dev/null +++ b/fire_seq_search_server/src/local_llm/example_llama_response.json @@ -0,0 +1,21 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": " It seems like there might be some confusion in your question. \"MS file format\" typically refers to the Microsoft Office document file formats, such as .docx, .xlsx, and .pptx.\n\nHowever, if you meant to ask about the WIF file format, then here's some information for you:\n\nWIF (Windows Image File) is not a widely used file format. It is a proprietary file format used by Microsoft's Windows Imaging Component (WIC) for storing and manipulating image data. WIF files can contain multiple images, each with its own metadata, and can be used for tasks such as image processing, thumbnail generation, and icon extraction.\n\nWIF files are not meant to be opened or edited by users directly, but rather are used as input and output files for applications that use the WIC API. If you need to work with WIF files, you would typically use a programming language and the WIC API to read and write the files.\n\nI hope this information helps clarify any confusion around the MS file format and the WIF file format. 
Let me know if you have any other questions!", + "role": "assistant" + } + } + ], + "created": 1724517653, + "id": "chatcmpl-4B", + "model": "model", + "object": "chat.completion", + "usage": { + "completion_tokens": 247, + "prompt_tokens": 14, + "total_tokens": 261 + } +} diff --git a/fire_seq_search_server/src/local_llm/mod.rs b/fire_seq_search_server/src/local_llm/mod.rs new file mode 100644 index 0000000..9e8673d --- /dev/null +++ b/fire_seq_search_server/src/local_llm/mod.rs @@ -0,0 +1,319 @@ +use log::{info, error}; +use crate::query_engine::DocData; + +use std::collections::HashMap; +use std::collections::VecDeque; +use std::process::{Command, Stdio}; +use std::fs::File; + +use std::sync::Arc; +use tokio::sync::Mutex; +use tokio::task::yield_now; +use tokio::task; +use tokio::time; + +use std::borrow::Cow; +use std::borrow::Cow::Borrowed; + + +//#[cfg(feature = "llm")] +use { + reqwest, + reqwest::StatusCode, + shellexpand::tilde, + + serde_derive::Deserialize, + serde_derive::Serialize, +}; + + + +// TODO Allow user to set prompt, instead of hard-coded in code +const HARD_CODED_PROMPT_STR: &'static str = r##" +You are a seasoned summary expert, capable of condensing and summarizing given articles, papers, or posts, accurately conveying the main idea to make the content easier to understand. + +You place great emphasis on user experience, never adding irrelevant content like "Summary," "The summary is as follows," "Original text," "You can check the original text if interested," or "Original link." Your summaries always convey the core information directly. + +You are adept at handling various large, small, and even chaotic text content, always accurately extracting key information and summarizing the core content globally to make it easier to understand. 
+ +=== Below is the article === + +"##; + +// Generated by https://transform.tools/json-to-rust-serde +#[derive(Debug, Serialize, Deserialize)] +pub struct OpenAiData { + pub model: String, + pub messages: Vec, +} + +#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct LlamaResponse { + pub choices: Vec, + pub created: i64, + pub id: String, + pub model: String, + pub object: String, + pub usage: Usage, +} + +#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Choice { + pub finish_reason: String, + pub index: i64, + pub message: Message, +} + +#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Message { + pub content: String, + pub role: String, +} + +#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Usage { + pub completion_tokens: i64, + pub prompt_tokens: i64, + pub total_tokens: i64, +} + +#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct HealthCheck { + pub slots_idle: i64, + pub slots_processing: i64, + pub status: String, +} + +// End genereated + +const LLM_SERVER_PORT: &str = "8081"; // TODO Remove this magic number + +struct JobProcessor { + done_job: HashMap, + job_queue: VecDeque, +} + +impl JobProcessor { + pub fn new() -> Self { + JobProcessor { + done_job: HashMap::new(), + job_queue: VecDeque::new(), + } + } + pub fn add(&mut self, doc:DocData) { + let title: &str = &doc.title; + info!("Job posted for {}", &title); + if !self.done_job.contains_key(title) { + self.job_queue.push_back(doc); + } + } +} + +pub struct LlmEngine { + endpoint: String, + client: reqwest::Client, + job_cache: Arc>, + //job_cache :Arc >>>, +} + + + +impl LlmEngine { + pub async fn llm_init() -> Self { + info!("llm called"); + + let lfile = locate_llamafile().await; + let lfile:String = lfile.unwrap(); + + let _cmd = Command::new("sh") + .args([ &lfile, "--nobrowser", + "--port", LLM_SERVER_PORT, + //">/tmp/llamafile.stdout", "2>/tmp/llamafile.stderr", + ]) + .stdout(Stdio::from(File::create("/tmp/llamafile.stdout.txt").unwrap())) + .stderr(Stdio::from(File::create("/tmp/llamafile.stderr.txt").unwrap())) + .spawn() + .expect("llm model failed to launch"); + + yield_now().await; + let wait_llm = time::Duration::from_millis(500); + tokio::time::sleep(wait_llm).await; + task::yield_now().await; + + let endpoint = format!("http://127.0.0.1:{}", LLM_SERVER_PORT).to_string(); + + + loop { + let resp = reqwest::get(endpoint.to_owned() + "/health").await; + let resp = match resp { + Err(_e) => { + info!("llm not ready"); + let wait_llm = time::Duration::from_millis(1000); + tokio::time::sleep(wait_llm).await; + task::yield_now().await; + continue; + }, + Ok(r) => r, + }; + if resp.status() != StatusCode::from_u16(200).unwrap() { + info!("endpoint failed"); + //TODO error handling + } + break; + } + + let client = reqwest::Client::new(); + + info!("llm engine initialized"); + let map = Arc::new(Mutex::new( + JobProcessor::new())); + Self { + endpoint, + client, + job_cache: map + } + } + + fn build_data(full_text: Cow<'_, str>) -> OpenAiData { + + fn build_message(chat:String) -> Message { + Message{ + role: "user".to_owned(), + content: chat, + } + } + let mut msgs = Vec::new(); + + let prompt_string = &HARD_CODED_PROMPT_STR; + let mut chat_text = prompt_string.to_string(); + chat_text += &full_text; + msgs.push( build_message(chat_text) ); + + OpenAiData { + model: "model".to_owned(), + messages: msgs, + } + } +} + +impl LlmEngine{ + pub async fn summarize(&self, 
full_text: &str) -> String { + //http://localhost:8080/completion + let ep = self.endpoint.to_owned() + "/v1/chat/completions"; + let data = Self::build_data( Borrowed(full_text) ); + let res = self.client.post(&ep) + .header("Content-Type", "application/json") + .json(&data) + .send() + .await + .unwrap(); + let content = res.text().await.unwrap(); + let parsed: LlamaResponse = serde_json::from_str(&content).unwrap(); + let v = parsed.choices; + let v0 = v.into_iter().next().unwrap(); + v0.message.content + //TODO remove unwrap + } + + pub async fn post_summarize_job(&self, doc: DocData) { + //TODO error handler? + let mut jcache = self.job_cache.lock().await;//.unwrap(); + jcache.add(doc); + drop(jcache); + } + + pub async fn call_llm_engine(&self) { + let health = self.health().await.unwrap(); + if health.slots_idle == 0 { + info!("No valid slot, continue"); + return; + } + + let next_job: Option; + + let mut jcache = self.job_cache.lock().await;//.unwrap(); + next_job = jcache.job_queue.pop_front(); + drop(jcache); + + let doc = match next_job { + Some(x) => x, + None => { return; }, + }; + + let title = doc.title.to_owned(); + + let jcache = self.job_cache.lock().await; + if jcache.done_job.contains_key(&title) { + return; + } + drop(jcache); + + info!("Start summarize job: {}", &title); + let summarize_result = self.summarize(&doc.body).await; + info!("Finished summarize job: {}", &title); + + let mut jcache = self.job_cache.lock().await; + jcache.done_job.insert(title, summarize_result); + drop(jcache); + } + + pub async fn quick_fetch(&self, title: &str) -> Option { + let jcache = self.job_cache.lock().await; + return jcache.done_job.get(title).cloned(); + } + + pub async fn get_llm_done_list(&self) -> Vec { + let mut r = Vec::new(); + let jcache = self.job_cache.lock().await; + for (title, _text) in &jcache.done_job { + r.push(title.to_owned()); + } + return r; + } + + pub async fn health(&self) -> Result> { + let res = self.client.get(self.endpoint.to_owned() + "/health") + .send() + .await + .unwrap(); + let content = res.text().await.unwrap(); + let parsed: HealthCheck = serde_json::from_str(&content).unwrap(); + Ok(parsed) + } +} + +#[derive(Debug)] +struct LlamaFileDef { + pub filename: String, + pub filepath: Option, + pub sha256: String, + #[allow(dead_code)] /* TODO rethink if we want auto download 2024 Sep 21 */ + pub download_link: String, +} + + +async fn locate_llamafile() -> Option { + let mut lf = LlamaFileDef { + filename: "mistral-7b-instruct-v0.2.Q4_0.llamafile".to_owned(), + filepath: None, + sha256: "1903778f7defd921347b25327ebe5dd902f29417ba524144a8e4f7c32d83dee8".to_owned(), + download_link: "mistral-7b-instruct-v0.2.Q4_0.llamafile".to_owned(), + }; + + let lf_base = tilde("~/.llamafile/"); + let lf_path = lf_base.to_string() + &lf.filename; + lf.filepath = Some( lf_path.to_owned() ); + info!("lf {:?}", &lf); + + let _ppath = std::path::Path::new(&lf_path); + //let val = sha256::try_digest(ppath).unwrap(); + let val = "1903778f7defd921347b25327ebe5dd902f29417ba524144a8e4f7c32d83dee8"; + if val != lf.sha256 { + error!("Wrong sha256sum for the model. 
Quit"); + return None; + } + + return lf.filepath; + +} + diff --git a/fire_seq_search_server/src/main.rs b/fire_seq_search_server/src/main.rs index 5b607dd..37f7951 100644 --- a/fire_seq_search_server/src/main.rs +++ b/fire_seq_search_server/src/main.rs @@ -1,9 +1,8 @@ -use std::net::SocketAddr; - -use warp::Filter; use log::info; use fire_seq_search_server::query_engine::{QueryEngine, ServerInformation}; +use fire_seq_search_server::local_llm::LlmEngine; +use fire_seq_search_server::query_engine::NotebookSoftware::*; use clap::Parser; @@ -45,7 +44,12 @@ struct Cli{ host: Option, } +use tokio::task; +use axum; +use axum::routing::get; +use fire_seq_search_server::http_client::endpoints; +use std::sync::Arc; #[tokio::main] async fn main() { @@ -54,53 +58,50 @@ async fn main() { .format_target(false) .init(); + let mut llm_loader = None; + if cfg!(feature="llm") { + info!("LLM Enabled"); + //tokio::task::JoinHandle + llm_loader = Some(task::spawn( async { LlmEngine::llm_init().await })); + } + + info!("main thread running"); let matches = Cli::parse(); - let host = matches.host.clone().unwrap_or_else(|| "127.0.0.1:3030".to_string()); - let host: SocketAddr = host.parse().unwrap_or_else( - |_| panic!("Invalid host: {}", host) - ); let server_info: ServerInformation = build_server_info(matches); - let engine = QueryEngine::construct(server_info); - - - let engine_arc = std::sync::Arc::new(engine); - let arc_for_query = engine_arc.clone(); - let call_query = warp::path!("query" / String) - .map(move |name| { - fire_seq_search_server::http_client::endpoints::query( - name, arc_for_query.clone() ) - }); - let arc_for_server_info = engine_arc.clone(); - let get_server_info = warp::path("server_info") - .map(move || - fire_seq_search_server::http_client::endpoints::get_server_info( - arc_for_server_info.clone() - )); - - let arc_for_wordcloud = engine_arc.clone(); - let create_word_cloud = warp::path("wordcloud") - .map(move || { - let div = fire_seq_search_server::http_client::endpoints::generate_word_cloud( - arc_for_wordcloud.clone() - ); - warp::http::Response::builder() - .header("content-type", "text/html; charset=utf-8") - .body(div) - // .status(warp::http::StatusCode::OK) + let mut engine = QueryEngine::construct(server_info).await; + + info!("query engine build finished"); + if cfg!(feature="llm") { + let llm:LlmEngine = llm_loader.unwrap().await.unwrap(); + let llm_arc = Arc::new(llm); + let llm_poll = llm_arc.clone(); + engine.llm = Some(llm_arc); + + let _poll_handle = tokio::spawn( async move { + loop { + llm_poll.call_llm_engine().await; + let wait_llm = tokio::time::Duration::from_millis(500); + tokio::time::sleep(wait_llm).await; + } }); + } - let routes = warp::get().and( - call_query - .or(get_server_info) - .or(create_word_cloud) - ); - warp::serve(routes) - .run(host) - .await; - - + let engine_arc = std::sync::Arc::new(engine); + let app = axum::Router::new() + .route("/query/:term", get(endpoints::query)) + .route("/server_info", get(endpoints::get_server_info)) + .route("/wordcloud", get(endpoints::generate_word_cloud)) + .route("/summarize/:title", get(endpoints::summarize)) + .route("/llm_done_list", get(endpoints::get_llm_done_list)) + .with_state(engine_arc.clone()); + + let listener = tokio::net::TcpListener::bind(&engine_arc.server_info.host) + .await.unwrap(); + axum::serve(listener, app).await.unwrap(); + // let llm = llm.await.unwrap(); + //llm.summarize("hi my friend").await; } @@ -117,6 +118,11 @@ fn build_server_info(args: Cli) -> ServerInformation { 
String::from(guess)
         }
     };
+    let host: String = args.host.clone().unwrap_or_else(|| "127.0.0.1:3030".to_string());
+    let mut software = Logseq;
+    if args.obsidian_md {
+        software = Obsidian;
+    }
     ServerInformation{
         notebook_path: args.notebook_path,
         notebook_name,
@@ -126,8 +132,10 @@ fn build_server_info(args: Cli) -> ServerInformation {
             args.show_summary_single_line_chars_limit,
         parse_pdf_links: args.parse_pdf_links,
         exclude_zotero_items:args.exclude_zotero_items,
-        obsidian_md: args.obsidian_md,
+        software,
         convert_underline_hierarchy: true,
+        host,
+        llm_enabled: cfg!(feature="llm"),
     }
 }
diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs
index 26baf8a..fc727f8 100644
--- a/fire_seq_search_server/src/markdown_parser/mod.rs
+++ b/fire_seq_search_server/src/markdown_parser/mod.rs
@@ -7,9 +7,9 @@ use crate::query_engine::ServerInformation;
 
 // https://docs.rs/regex/latest/regex/#repetitions
 // https://stackoverflow.com/a/8303552/1166518
-pub fn exclude_advanced_query(md: &str) -> Cow<str> {
+pub fn exclude_advanced_query(md: Cow<'_,str>) -> Cow<'_, str> {
     if !md.contains('#') {
-        return Cow::Borrowed(md);
+        return md;
     }
 
     lazy_static! {
@@ -17,8 +17,7 @@ pub fn exclude_advanced_query(md: &str) -> Cow<str> {
             r"\#\+BEGIN_QUERY[\S\s]+?\#\+END_QUERY")
             .unwrap();
     }
-    // return RE.replace_all(&md, " ")
-    return RE.replace_all(&md, " ");
+    return RE.replace_all(&md, " ").into_owned().into();
 }
 
 fn hack_specific_chars_cow(text: Cow<str>) -> String {
@@ -27,13 +26,39 @@ fn hack_specific_chars_cow(text: Cow<str>) -> String {
     text.replace(bullet, " ")
 }
 
-pub fn parse_logseq_notebook(md: &str, title: &str, server_info: &ServerInformation) -> String {
+use crate::query_engine::NotebookSoftware;
+use std::borrow::Borrow;
+use log::info;
+
+fn remove_obsidian_header<'a>(content: Cow<'a, str>) -> Cow<'a, str> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(
+            r"---[\s\S]*?---"
+        ).unwrap();
+    }
+    info!("from {:?}", &content);
+    let cr = content.borrow();
+    let ret: Cow<str> = RE.replace(cr, " ");
+    info!("into {:?}", &ret);
+    ret.into_owned().into()
+}
+
+pub fn parse_logseq_notebook(md: Cow<'_,str>, title: &str, server_info: &ServerInformation) -> String {
     // Now we do some parsing for this file
     let content = exclude_advanced_query(md);
     let content = hack_specific_chars_cow(content);
+
+    let content = Cow::from(content);
+    let content = match &server_info.software {
+        NotebookSoftware::Obsidian => remove_obsidian_header(content),
+        _ => content,
+    };
     let content: String = markdown_to_text::convert_from_logseq(
         &content, title, server_info);
+
+    //let content = content.into_owned();
     content
+
 }
 
@@ -50,4 +75,4 @@ fn hack_specific_chars(text: String) -> String {
     let bullet = char::from_u32(0x00002022).unwrap();
     // println!("{}", bullet);
     text.replace(bullet, " ")
-}
\ No newline at end of file
+}
diff --git a/fire_seq_search_server/src/post_query/app_uri.rs b/fire_seq_search_server/src/post_query/app_uri.rs
index 7859125..1d9e8c1 100644
--- a/fire_seq_search_server/src/post_query/app_uri.rs
+++ b/fire_seq_search_server/src/post_query/app_uri.rs
@@ -1,11 +1,13 @@
 use log::{error, info};
-use crate::post_query::logseq_uri::generate_logseq_uri;
+use crate::post_query::logseq_uri::{generate_logseq_uri,parse_date_from_str};
 use crate::post_query::obsidian_uri::generate_obsidian_uri;
 use crate::query_engine::ServerInformation;
+
 // Maybe I should wrap them with the same interface? 
-Zhenbo Li 2023-Feb-05 +// Deprecated on 2024-Sep-21 pub fn generate_uri(title: &str, is_page_hit: &bool, server_info: &ServerInformation) -> String { - if server_info.obsidian_md { + if server_info.software == Obsidian { info!("Generating Obsidian URI for {}", title); if !is_page_hit { error!("Journal is unsupported for Obsidian yet"); @@ -14,6 +16,19 @@ pub fn generate_uri(title: &str, is_page_hit: &bool, server_info: &ServerInforma return generate_obsidian_uri(&title, *is_page_hit, &server_info); } - return generate_logseq_uri(&title, &is_page_hit, &server_info); + return generate_logseq_uri(&title, *is_page_hit, &server_info); +} + +use crate::query_engine::NotebookSoftware::{Logseq,Obsidian}; -} \ No newline at end of file +pub fn generate_uri_v2(title: &str, server_info: &ServerInformation) -> String { + match &server_info.software { + Obsidian => generate_obsidian_uri(title, true, server_info), + Logseq => { + let dt = parse_date_from_str(title); + // TODO remove this duplicate calc + // I don't care the performance here, but I want to make code cleaner - 2024 Sep 21 + generate_logseq_uri(title, dt.is_none(), server_info) + } + } +} diff --git a/fire_seq_search_server/src/post_query/highlighter.rs b/fire_seq_search_server/src/post_query/highlighter.rs index bced88f..a77facc 100644 --- a/fire_seq_search_server/src/post_query/highlighter.rs +++ b/fire_seq_search_server/src/post_query/highlighter.rs @@ -180,11 +180,11 @@ impl RenderBlock { // pub for test pub fn split_leaf_node_by_terms(&self, terms: &[&str], server_info: &ServerInformation) ->Vec{ if terms.is_empty() { return Vec::new(); } - info!("Highlighting token: {:?}", terms); + debug!("Highlighting token: {:?}", terms); let r = self.split_leaf_node_by_single_term(terms[0], server_info); if r.is_empty() { return self.split_leaf_node_by_terms(&terms[1..], server_info); } let mut result = Vec::new(); - info!("We have {} blocks: {:?}", r.len(), &r); + debug!("We have {} blocks: {:?}", r.len(), &r); for block in r { if block.is_hit { result.push(block); } else { @@ -201,7 +201,7 @@ impl RenderBlock { if self.is_hit { return ; } if self.children.is_empty() { let child = self.split_leaf_node_by_terms(terms, server_info); - info!("Children list: {:?}", &child); + debug!("Children list: {:?}", &child); if !child.is_empty() { self.children = child; self.text = String::default(); diff --git a/fire_seq_search_server/src/post_query/hit_parsed.rs b/fire_seq_search_server/src/post_query/hit_parsed.rs index e3aa726..d030554 100644 --- a/fire_seq_search_server/src/post_query/hit_parsed.rs +++ b/fire_seq_search_server/src/post_query/hit_parsed.rs @@ -1,6 +1,6 @@ use log::debug; use crate::JOURNAL_PREFIX; -use crate::post_query::app_uri::generate_uri; +use crate::post_query::app_uri::generate_uri_v2; use crate::post_query::highlighter::highlight_keywords_in_body; use crate::query_engine::ServerInformation; @@ -14,14 +14,27 @@ pub struct FireSeqSearchHitParsed { pub logseq_uri: String, } +use tantivy::schema::document::OwnedValue; impl FireSeqSearchHitParsed { - pub fn from_tantivy(doc: &tantivy::schema::Document, + //TODO remove these dup code + fn take_str_from_doc(doc: &tantivy::TantivyDocument, pos:usize) -> &str { + /* + let title: &str = doc.field_values()[0].value().as_text().unwrap(); + let body: &str = doc.field_values()[1].value().as_text().unwrap(); + */ + let v: &OwnedValue = doc.field_values()[pos].value(); + match v{ + OwnedValue::Str(s) => s, + _ => panic!("Wrong type") + } + } + pub fn from_tantivy(doc: &tantivy::TantivyDocument, 
score: f32, term_tokens: &Vec<String>, server_info: &ServerInformation) ->FireSeqSearchHitParsed {
-        let title: &str = doc.field_values()[0].value().as_text().unwrap();
-        let body: &str = doc.field_values()[1].value().as_text().unwrap();
+        let title = Self::take_str_from_doc(doc, 0);
+        let body = Self::take_str_from_doc(doc, 1);
         let summary = highlight_keywords_in_body(body, term_tokens, server_info);
         let mut is_page_hit = true;
@@ -35,7 +48,7 @@ impl FireSeqSearchHitParsed {
             title.to_string()
         };
-        let logseq_uri = generate_uri(&title, &is_page_hit, server_info);
+        let logseq_uri = generate_uri_v2(&title, server_info);
         debug!("Processing a hit, title={}, uri={}", &title, &logseq_uri);
diff --git a/fire_seq_search_server/src/post_query/logseq_uri.rs b/fire_seq_search_server/src/post_query/logseq_uri.rs
index 8aba6e9..16dcfc7 100644
--- a/fire_seq_search_server/src/post_query/logseq_uri.rs
+++ b/fire_seq_search_server/src/post_query/logseq_uri.rs
@@ -1,4 +1,4 @@
-use log::error;
+use log::{error,info};
 use crate::ServerInformation;
 use url::Url;
@@ -37,8 +37,8 @@ pub fn process_note_title(file_name: &str, server_info: &ServerInformation) -> S
     file_name
 }
 
-pub fn generate_logseq_uri(title: &str, is_page_hit: &bool, server_info: &ServerInformation) -> String {
-    return if *is_page_hit {
+pub fn generate_logseq_uri(title: &str, is_page_hit: bool, server_info: &ServerInformation) -> String {
+    return if is_page_hit {
         let title = process_note_title(title, server_info);
         let mut uri = Url::parse("logseq://graph/").unwrap();
         uri.set_path(&server_info.notebook_name);
@@ -53,7 +53,7 @@ pub fn generate_logseq_uri(title: &str, is_page_hit: &bool, server_info: &Server
 }
 
 #[derive(PartialEq, Debug)]
-struct JournalDate {
+pub struct JournalDate {
     pub year: u32,
     pub month: u32,
     pub date: u32,
@@ -152,9 +152,9 @@ fn parse_slice_to_u8(slice: Option<&str>) -> Option<u32> {
     }
 }
 
-fn parse_date_from_str(title: &str) -> Option<JournalDate> {
+pub fn parse_date_from_str(title: &str) -> Option<JournalDate> {
     if title.len() != 10 {
-        error!("Journal length unexpected: {}", title);
+        info!("Journal length unexpected: {}", title);
         return None;
     }
 
@@ -205,18 +205,18 @@ mod test_logseq_uri {
         let server_info = generate_server_info_for_test();
         // Don't encode / at here. It would be processed by serde. - 2022-11-27
-        let r = generate_logseq_uri("Games/EU4", &true, &server_info);
+        let r = generate_logseq_uri("Games/EU4", true, &server_info);
         assert_eq!(&r, "logseq://graph/logseq_notebook?page=Games%2FEU4");
-        let r = generate_logseq_uri("Games/赛马娘", &true, &server_info);
+        let r = generate_logseq_uri("Games/赛马娘", true, &server_info);
         assert_eq!(&r, "logseq://graph/logseq_notebook?page=Games%2F%E8%B5%9B%E9%A9%AC%E5%A8%98");
         let r = generate_logseq_journal_uri("2022_12_14", &server_info);
         assert_eq!(&r,"logseq://graph/logseq_notebook?page=Dec+14th%2C+2022");
-        let r = generate_logseq_uri("fireSeqSearch___test___5", &true, &server_info);
+        let r = generate_logseq_uri("fireSeqSearch___test___5", true, &server_info);
         assert_eq!(&r,"logseq://graph/logseq_notebook?page=fireSeqSearch%2Ftest%2F5");
-        let r = generate_logseq_uri("C++", &true, &server_info);
+        let r = generate_logseq_uri("C++", true, &server_info);
         assert_eq!(&r, "logseq://graph/logseq_notebook?page=C%2B%2B");
     }
-}
\ No newline at end of file
+}
diff --git a/fire_seq_search_server/src/post_query/mod.rs b/fire_seq_search_server/src/post_query/mod.rs
index 3560055..caaa802 100644
--- a/fire_seq_search_server/src/post_query/mod.rs
+++ b/fire_seq_search_server/src/post_query/mod.rs
@@ -9,12 +9,11 @@ pub mod app_uri;
 pub mod obsidian_uri;
 
 use rayon::prelude::*;
-use tantivy::{LeasedItem, Searcher};
 use crate::post_query::hit_parsed::FireSeqSearchHitParsed;
 
 pub fn post_query_wrapper(top_docs: Vec<(f32, tantivy::DocAddress)>,
                           term: &str,
-                          searcher: &tantivy::LeasedItem<Searcher>,
+                          searcher: &tantivy::Searcher,
                           server_info: &ServerInformation) -> Vec<String> {
     let term_tokens = tokenize_default(term);
     info!("get term tokens {:?}", &term_tokens);
@@ -25,10 +24,11 @@ pub fn post_query_wrapper(top_docs: Vec<(f32, tantivy::DocAddress)>,
 }
 
 fn parse_and_serde(x: &(f32, tantivy::DocAddress),
-                   searcher: &LeasedItem<Searcher>, term_tokens: &Vec<String>,
+                   searcher: &tantivy::Searcher,
+                   term_tokens: &Vec<String>,
                    server_info: &ServerInformation) -> String {
     // FireSeqSearchHitParsed
-    let doc = searcher.doc(x.1).unwrap();
+    let doc: tantivy::TantivyDocument = searcher.doc(x.1).unwrap();
     let score = x.0;
     let hit_parsed = FireSeqSearchHitParsed::from_tantivy(
         &doc, score, term_tokens, server_info
diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs
index 756787e..8451e05 100644
--- a/fire_seq_search_server/src/query_engine/mod.rs
+++ b/fire_seq_search_server/src/query_engine/mod.rs
@@ -1,12 +1,21 @@
 // Everything about Tantivy should be hidden behind this component
 
-use log::{info, warn};
-use crate::{Article, decode_cjk_str, JiebaTokenizer};
+use log::{debug, info, error};
+use crate::decode_cjk_str;
 use crate::post_query::post_query_wrapper;
+use std::sync::Arc;
+use std::borrow::Cow;
+#[derive(Debug, Clone, serde::Serialize,PartialEq)]
+pub enum NotebookSoftware {
+    Logseq,
+    Obsidian,
+}
+
+// This struct should be immutable when the program starts running
 #[derive(Debug, Clone, serde::Serialize)]
 pub struct ServerInformation {
     pub notebook_path: String,
@@ -16,62 +25,185 @@ pub struct ServerInformation {
     pub show_summary_single_line_chars_limit: usize,
     pub parse_pdf_links: bool,
     pub exclude_zotero_items:bool,
-    pub obsidian_md: bool,
-
+    pub software: NotebookSoftware,
     /// Experimental. Not sure if I should use this global config - 2022-12-30
     pub convert_underline_hierarchy: bool,
+
+    pub host: String,
+
+    pub llm_enabled: bool,
 }
 
+use crate::language_tools::tokenizer::FireSeqTokenizer;
 struct DocumentSetting {
     schema: tantivy::schema::Schema,
-    tokenizer: JiebaTokenizer,
+    tokenizer: FireSeqTokenizer,
 }
 
+use crate::local_llm::LlmEngine;
 pub struct QueryEngine {
     pub server_info: ServerInformation,
     reader: tantivy::IndexReader,
     query_parser: tantivy::query::QueryParser,
-    articles: Vec<Article>,
+    //articles: Vec<Article>, //TODO remove it. only word cloud needs it
+    pub llm: Option<Arc<LlmEngine>>,
 }
 
+use tantivy::IndexWriter;
+use tantivy::TantivyDocument;
+
+use crate::load_notes::NoteListItem;
+use futures::stream::FuturesUnordered;
+ use futures::StreamExt;
+
+ use tantivy::doc;
+
 impl QueryEngine {
-    pub fn construct(server_info: ServerInformation) -> Self {
+    pub async fn construct(server_info: ServerInformation) -> Self {
+
         let document_setting: DocumentSetting = build_document_setting();
-        let loaded_notes = crate::load_notes::read_all_notes(&server_info);
-        let loaded_articles: Vec<Article> = loaded_notes.into_iter().map(
-            |x| Article{file_name:x.0, content:x.1}
-        ).collect();
-        let index = indexing_documents(&server_info, &document_setting, &loaded_articles);
+        let note_list = crate::load_notes::retrive_note_list(&server_info);
+        let index: tantivy::Index = QueryEngine::build_index(&server_info,
+            &document_setting,
+            note_list).await;
         let (reader, query_parser) = build_reader_parser(&index, &document_setting);
+        debug!("Query engine construction finished");
+
         QueryEngine {
             server_info,
             reader, query_parser,
-            articles: loaded_articles,
+            // articles: Vec::new(),
+            // articles: loaded_articles,
+            llm: None,
+        }
+    }
+
+    async fn load_single_note(
+        server_info: &ServerInformation,
+        document_setting: &DocumentSetting,
+        note: NoteListItem,
+        index_writer: &IndexWriter) {
+
+        let raw_content = match std::fs::read_to_string(&note.realpath) {
+            Ok(s) => s,
+            Err(e) => {
+                error!("Failed to read {:?} err({:?}, skipping", &note, &e);
+                return;
+            }
+        };
+
+        let content = crate::markdown_parser::parse_logseq_notebook(
+            Cow::from(raw_content), &note.title, server_info);
+
+        let schema = &document_setting.schema;
+        let title = schema.get_field("title").unwrap();
+        let body = schema.get_field("body").unwrap();
+        index_writer.add_document(
+            tantivy::doc!{
+                title => note.title,
+                body => content,
+            }
+        ).unwrap();
+    }
+
+    async fn load_all_notes(server_info: &ServerInformation,
+        document_setting: &DocumentSetting,
+        note_list: Vec<NoteListItem>,
+        index_writer: &IndexWriter) {
+
+        let mut futs: FuturesUnordered<_> = FuturesUnordered::new();
+        for article in note_list {
+            futs.push(
+                QueryEngine::load_single_note(
+                    server_info,
+                    document_setting,
+                    article,
+                    index_writer)
+            );
         }
+        while let Some(_result) = futs.next().await {}
+    }
+
+    async fn build_index(server_info: &ServerInformation,
+        document_setting: &DocumentSetting,
+        note_list: Vec<NoteListItem>) -> tantivy::Index {
+
+        let schema = &document_setting.schema;
+        let index = tantivy::Index::create_in_ram(schema.clone());
+
+        index.tokenizers().register(TOKENIZER_ID, document_setting.tokenizer.clone());
+        let mut index_writer = index.writer(50_000_000).unwrap();
+
+        QueryEngine::load_all_notes(&server_info,
+            &document_setting,
+            note_list,
+            &index_writer).await;
+
+        index_writer.commit().unwrap();
+        index
     }
+}
+
+#[derive(Debug)]
+pub struct DocData {
+    pub title: String,
+    pub body: String,
+}
+use tantivy::schema::OwnedValue;
+impl DocData {
+    fn take_str_from_doc(doc: &tantivy::TantivyDocument, pos:usize) -> &str {
+        /*
+        let title: &str = doc.field_values()[0].value().as_text().unwrap();
+        let body: &str = doc.field_values()[1].value().as_text().unwrap();
+        */
+        let v: &OwnedValue = doc.field_values()[pos].value();
+        match v{
+            OwnedValue::Str(s) => s,
+            _ => panic!("Wrong type")
+        }
+    }
+    pub fn retrive(searcher: &tantivy::Searcher, docid: tantivy::DocAddress) -> Self {
+        let doc: tantivy::TantivyDocument = searcher.doc(docid).unwrap();
+        let title = Self::take_str_from_doc(&doc, 0).to_owned();
+        let body = Self::take_str_from_doc(&doc, 1).to_owned();
+        Self {
+            title, body
+        }
+    }
+}
+
+impl QueryEngine {
     pub fn generate_wordcloud(self: &Self) -> String {
-        crate::word_frequency::generate_wordcloud(&self.articles)
+        String::from("TODO: wordcloud is turned off")
+        //crate::word_frequency::generate_wordcloud(&self.articles)
     }
 
-    pub fn query_pipeline(self: &Self, term: String) -> String {
+    pub async fn query_pipeline(self: &Self, term: String) -> String {
         let term: String = term_preprocess(term);
         info!("Searching {}", &term);
-        let searcher = 
self.reader.searcher(); let server_info: &ServerInformation = &self.server_info; let top_docs: Vec<(f32, tantivy::DocAddress)> = self.get_top_docs(&term); + let searcher: tantivy::Searcher = self.reader.searcher(); + + if cfg!(feature="llm") { + for (_f, docid) in &top_docs { + let doc = DocData::retrive(&searcher, *docid); + let llm = self.llm.as_ref().unwrap(); + llm.post_summarize_job(doc).await; + } + } + + let result: Vec = post_query_wrapper(top_docs, &term, &searcher, &server_info); + let json = serde_json::to_string(&result).unwrap(); - // info!("Search result {}", &json); json } @@ -88,6 +220,40 @@ impl QueryEngine { } } +impl QueryEngine { + async fn wait_for_summarize(&self, title: String) -> String { + let llm = self.llm.as_ref().unwrap(); + let wait_llm = tokio::time::Duration::from_millis(50); + // TODO maybe add a guard to make sure don't wait too long? + loop { + let result = llm.quick_fetch(&title).await; + match result { + Some(s) => { return s; }, + None => { } + }; + tokio::time::sleep(wait_llm).await; + } + } + pub async fn summarize(&self, title: String) -> String { + info!("Called summarize on {}", &title); + if cfg!(feature="llm") { + self.wait_for_summarize(title).await + } else { + "LLM turned off".to_owned() + } + } + pub async fn get_llm_done_list(&self) -> String { + if cfg!(feature="llm") { + let llm = self.llm.as_ref().unwrap(); + let result = &llm.get_llm_done_list().await; + let json = serde_json::to_string(&result).unwrap(); + return json; + } else { + "LLM turned off".to_owned() + } + } +} + fn term_preprocess(term:String) -> String { // in the future, I would use tokenize_sentence_to_text_vec here let term = term.replace("%20", " "); @@ -100,7 +266,7 @@ fn build_reader_parser(index: &tantivy::Index, document_setting: &DocumentSettin -> (tantivy::IndexReader, tantivy::query::QueryParser) { let reader = index .reader_builder() - .reload_policy(tantivy::ReloadPolicy::OnCommit) + .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay) // TODO switch to manual .try_into().unwrap(); let title = document_setting.schema.get_field("title").unwrap(); let body = document_setting.schema.get_field("body").unwrap(); @@ -108,42 +274,6 @@ fn build_reader_parser(index: &tantivy::Index, document_setting: &DocumentSettin (reader, query_parser) } -fn indexing_documents(server_info: &ServerInformation, - document_setting: &DocumentSetting, - pages:&Vec) -> tantivy::Index { - - let schema = &document_setting.schema; - let index = tantivy::Index::create_in_ram(schema.clone()); - - index.tokenizers().register(crate::TOKENIZER_ID, document_setting.tokenizer.clone()); - - let mut index_writer = index.writer(50_000_000).unwrap(); - - - if server_info.obsidian_md { - warn!("Obsidian mode."); - assert!(!server_info.enable_journal_query); - } - - let title = schema.get_field("title").unwrap(); - let body = schema.get_field("body").unwrap(); - - - for article in pages { - index_writer.add_document( - tantivy::doc!{ title => article.file_name.clone(), - body => article.content.clone()} - ).unwrap(); - } - - - - index_writer.commit().unwrap(); - index -} - - - fn build_document_setting() -> DocumentSetting { let (schema, tokenizer) = build_schema_tokenizer(); DocumentSetting{ @@ -151,18 +281,19 @@ fn build_document_setting() -> DocumentSetting { } } +use crate::language_tools::tokenizer::TOKENIZER_ID; fn build_schema_tokenizer() -> (tantivy::schema::Schema, - JiebaTokenizer + FireSeqTokenizer // Box ) { let mut schema_builder = tantivy::schema::SchemaBuilder::default(); let 
text_indexing = tantivy::schema::TextFieldIndexing::default() - .set_tokenizer(crate::TOKENIZER_ID) // Set custom tokenizer + .set_tokenizer(TOKENIZER_ID) // Set custom tokenizer .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions); let text_options = tantivy::schema::TextOptions::default() .set_indexing_options(text_indexing) .set_stored(); - let tokenizer:JiebaTokenizer = JiebaTokenizer {}; + let tokenizer = FireSeqTokenizer {}; let _title = schema_builder.add_text_field("title", text_options.clone()); let _body = schema_builder.add_text_field("body", text_options); diff --git a/fire_seq_search_server/tests/unit_test_load_notes.rs b/fire_seq_search_server/tests/unit_test_load_notes.rs index 612f640..d553336 100644 --- a/fire_seq_search_server/tests/unit_test_load_notes.rs +++ b/fire_seq_search_server/tests/unit_test_load_notes.rs @@ -1,6 +1,7 @@ -use fire_seq_search_server::load_notes::read_specific_directory; use fire_seq_search_server::markdown_parser::{exclude_advanced_query, parse_to_plain_text}; +use std::borrow::Cow; + fn load_articles() -> Vec<(String, String)> { let r = read_specific_directory("tests/resource/pages"); @@ -39,12 +40,79 @@ fn parse() { #[test] fn exclude_advance_query() { let md = read_file_to_line("advanced_query.md"); - let result = exclude_advanced_query(&md); + let md = Cow::from(md); + let result = exclude_advanced_query(md); assert!(!result.contains("exempli")); assert!(result.contains("In this test page we have")); let md = read_file_to_line("blog_thunderbird_zh.md"); - let result = exclude_advanced_query(&md); + let md = Cow::from(md); + let result = exclude_advanced_query(md.clone()); assert_eq!(md, result); -} \ No newline at end of file +} + + + + + + + +// ===================== +// These functions are removed in https://github.com/Endle/fireSeqSearch/pull/149/commits/7692bd9091380858b0cbeb2fa10d8c01dabcba91 +// aka https://github.com/Endle/fireSeqSearch/pull/147 +// To make unit test happy, I copied them as test helper functions +// Zhenbo - 2024 Sep 21 +use std::fs::DirEntry; +use rayon::iter::IntoParallelRefIterator; +use rayon::iter::ParallelIterator; +use std::process; +fn read_md_file_wo_parse(note: &std::fs::DirEntry) -> Option<(String, String)> { + if let Ok(file_type) = note.file_type() { + // Now let's show our entry's file type! 
+        if file_type.is_dir() {
+            return None;
+        }
+    } else {
+        return None;
+    }
+
+    let note_path = note.path();
+    let note_title = match note_path.file_stem() {
+        Some(osstr) => osstr.to_str().unwrap(),
+        None => {
+            return None;
+        }
+    };
+    let content : String = match std::fs::read_to_string(&note_path) {
+        Ok(c) => c,
+        Err(e) => {
+            if note_title.to_lowercase() == ".ds_store" {
+            } else {
+            }
+            return None;
+        }
+    };
+
+    Some((note_title.to_string(),content))
+}
+fn read_specific_directory(path: &str) -> Vec<(String, String)> {
+    let notebooks = match std::fs::read_dir(path) {
+        Ok(x) => x,
+        Err(e) => {
+            process::abort();
+        }
+    };
+    let mut note_filenames: Vec<DirEntry> = Vec::new();
+    for note in notebooks {
+        let note : DirEntry = note.unwrap();
+        note_filenames.push(note);
+    }
+    let result: Vec<(String,String)> = note_filenames.par_iter()
+        .map(|note| read_md_file_wo_parse(&note))
+        .filter(|x| (&x).is_some())
+        .map(|x| x.unwrap())
+        .collect();
+
+    result
+}
diff --git a/pack_firefox_extension.sh b/pack_firefox_extension.sh
index 711724c..5131f21 100755
--- a/pack_firefox_extension.sh
+++ b/pack_firefox_extension.sh
@@ -1,4 +1,4 @@
 cd fireSeqSearch_addon
 zip -r -FS ../fireSeqSearch.zip * --exclude '*.git*' --exclude "monkeyscript.user.js" --exclude "violentmonkeyscript.user.js"
 cd ..
-cp -f fireSeqSearch.zip /dev/shm
+cp -f fireSeqSearch.zip ~/Downloads #/dev/shm
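For reference, the axum routes wired up in `main.rs` above (`/query/:term`, `/summarize/:title`, `/llm_done_list`) can be exercised end to end once the server is running. The sketch below is illustrative only and not part of the patch; it assumes the default host `127.0.0.1:3030` set in `build_server_info`, a server built with the `llm` feature, and `tokio` plus `reqwest` (with its `json` feature) available as client-side dependencies.

```rust
use std::error::Error;

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let base = "http://127.0.0.1:3030";

    // 1. Run a search; with the llm feature this also enqueues summarize jobs.
    let hits = reqwest::get(format!("{base}/query/rust")).await?.text().await?;
    println!("raw hits: {hits}");

    // 2. Poll the list of titles whose summaries have already finished.
    let done: Vec<String> = reqwest::get(format!("{base}/llm_done_list"))
        .await?
        .json()
        .await?;
    println!("summaries ready for: {done:?}");

    // 3. Fetch one summary; the server waits until the job is done, so only
    //    ask for titles reported by /llm_done_list (URL-encode if needed).
    if let Some(title) = done.first() {
        let summary = reqwest::get(format!("{base}/summarize/{title}"))
            .await?
            .text()
            .await?;
        println!("{title}: {summary}");
    }
    Ok(())
}
```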