Changes for v0.1.0 in the Rust version
IFFranciscoME committed Oct 19, 2024
1 parent 52c8bcc commit 66b26b6
Showing 10 changed files with 124 additions and 39 deletions.
27 changes: 17 additions & 10 deletions .gitignore
@@ -1,34 +1,41 @@

# -- Local Files ------------------------------------------------------------ #
# -- ----------- ------------------------------------------------------------ #
# -- Local Files ---------------------------------------------------------------------- #
# -- ----------- ---------------------------------------------------------------------- #

*.pdf
*.gguf
*.pdf
models/
# -- Rust ------------------------------------------------------------------- #
# -- ---- ------------------------------------------------------------------- #

# -- ChromaDB ------------------------------------------------------------------------- #
# -- ---- ----------------------------------------------------------------------------- #

chroma/
chroma_langchain_db/

# -- Rust ----------------------------------------------------------------------------- #
# -- ---- ----------------------------------------------------------------------------- #

/rust/target
/rust/Cargo.lock

# -- Python ----------------------------------------------------------------- #
# -- ------ ----------------------------------------------------------------- #
# -- Python --------------------------------------------------------------------------- #
# -- ------ --------------------------------------------------------------------------- #

*.cpython-311-darwin.so
__pycache__/

# -- Python ----------------------------------------------------------------- #
# -- ------ ----------------------------------------------------------------- #
# -- Python --------------------------------------------------------------------------- #
# -- ------ --------------------------------------------------------------------------- #

.python-version
pyrightconfig.json
files/
*.egg-info


# -- File Structure --------------------------------------------------------- #
# -- -------------- --------------------------------------------------------- #
# -- File Structure ------------------------------------------------------------------- #
# -- -------------- ------------------------------------------------------------------- #

*.whl
build/
2 changes: 1 addition & 1 deletion rust/Cargo.toml
@@ -24,6 +24,6 @@ llm_models = "0.0.1"

[lib]
name = "molina"
crate-type = ["lib"]
crate-type = ["cdylib"]
test = true
doctest = true
27 changes: 17 additions & 10 deletions rust/examples/extract_content.rs
@@ -1,5 +1,5 @@
use molina::content::extract;

use molina::data::loader;
use std::error::Error;
use llm_models::tokenizer::LlmTokenizer;
use std::path::PathBuf;
@@ -9,28 +9,35 @@ fn main() -> Result<(), Box<dyn Error>> {
let wd_folder: String = "/Users/franciscome/git/iteralabs/molina".to_owned();
let in_folder: &str = "/knowledge";
let in_subfolder: &str = "/conference_icml";
let in_file: &str = "/alon22a.pdf";
let in_file: &str = "/mao24c.pdf";
let in_path = wd_folder.clone() + in_folder + in_subfolder + in_file;

// -------------------------------------------------------------- FILES LOADING -- //
// -------------------------------------------------------------- ------------- -- //

//let v_files = loader::load_files(&in_path);
//println!("v_files is: {:?}", &v_files);

// -- ------------------------------------------------------ CONTENT EXTRACTION -- //
// -- ------------------------------------------------------ ------------------ -- //

//let in_file: &str = "/1-s2.0-S0032063323002052-main.pdf";
let r_extraction = extract::extract_text(&in_path);
let raw_document = &r_extraction.unwrap();

println!("\nDoc's page content: \n\n {:?}", raw_document[&1]);
println!("r_extraction has: {:?}", &r_extraction);

let raw_document = &r_extraction?;
println!("\nDoc's page content: \n\n {:?}", raw_document[&4]);

// -- ------------------------------------------------------------ TOKENIZATION -- //
// -- ------------------------------------------------------------ ------------ -- //

let in_tok = wd_folder.clone() + "/models/Meta-Llama-3-8B-Instruct/tokenizer.json";
let path_buf = PathBuf::from(in_tok);

let llama_tokenizer = LlmTokenizer::new_from_tokenizer_json(&path_buf)?;
let llama_tokens = llama_tokenizer.tokenize("This is a sentence");

println!("\nValidation: Token count: {}", llama_tokens.len());
println!("Validation: Downloaded meta/llama3 {:?}\n", llama_tokens);

// let llama_tokens = llama_tokenizer.tokenize("This is a sentence");
//println!("\nValidation: Token count: {}", llama_tokens.len());
//println!("Validation: Downloaded meta/llama3 {:?}\n", llama_tokens);

let tokenized_doc = &raw_document[&2];
let tokens_doc = llama_tokenizer.tokenize(tokenized_doc);
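A side note on the example above: indexing the returned BTreeMap with raw_document[&4] panics when no entry exists for page 4. A minimal sketch of a safer lookup inside the same main, using only the types already shown here:

let r_extraction = extract::extract_text(&in_path);
let raw_document = &r_extraction?;

// BTreeMap::get returns an Option, so a missing page is handled explicitly
match raw_document.get(&4) {
    Some(page_text) => println!("\nDoc's page content: \n\n {:?}", page_text),
    None => eprintln!("no text was extracted for page 4"),
}
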
19 changes: 13 additions & 6 deletions rust/src/content/extract.rs
@@ -14,7 +14,6 @@
//! Concrete Strategies: string, table, image, latex, code
//!
// use crate::content::filter::filter_content;
use crate::content::{filter, process};
use crate::messages::errors;
use lopdf::Document;
@@ -28,7 +27,7 @@ pub fn extract_text<P: AsRef<Path>>(
path: P,
) -> Result<BTreeMap<u32, String>, errors::ContentError> {
// Attempt to Load Document
let r_load = Document::load_filtered(path, filter::filter_content).map_err(|_| {
let mut r_load = Document::load_filtered(path, filter::filter_content).map_err(|_| {
errors::ContentError::ContentNotFound(String::from(
"During Attempt to Load Document",
))
@@ -38,13 +37,21 @@
let mut b_extract = BTreeMap::new();
let size_document = r_load.get_pages().len() as u32;

// Change the document's metadata
//r_load.change_producer("pdfTeX-1.40.20");

for i in 1..=size_document {

let i_text = r_load.extract_text(&vec![i]).map_err(|_| {
errors::ContentError::UnsuccessfulExtraction(String::from(
"During Attempt to Extract Text",
))

// println!("{:?}", r_load.catalog());

let err_message: String = String::from("Error during extraction");

errors::ContentError::UnsuccessfulExtraction(err_message)
});
// pre-process extracted text before inserting into BTreeMap

// to lower case
let r0_text = process::preprocess_text(&i_text?);
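For reference, extract_text returns a Result<BTreeMap<u32, String>, ContentError> keyed by page number, so pages iterate in order. A minimal consumption sketch, assuming only the signature above and that ContentError derives Debug:

use molina::content::extract;

fn main() {
    match extract::extract_text("knowledge/conference_icml/mao24c.pdf") {
        // The BTreeMap keys are 1-based page numbers, iterated in ascending order
        Ok(pages) => {
            for (page, text) in &pages {
                println!("page {}: {} characters", page, text.len());
            }
        }
        Err(e) => eprintln!("extraction failed: {:?}", e),
    }
}
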
6 changes: 4 additions & 2 deletions rust/src/content/filter.rs
@@ -4,7 +4,9 @@
use lopdf::Object;

pub static SIMPLE_FILTER: &[&str] = &["Length"];
pub static SIMPLE_FILTER: &[&str] = &[
"Length",
];

pub static DEFAULT_FILTER: &[&str] = &[
"Length",
@@ -43,7 +45,7 @@ pub fn filter_content(
}

if let Ok(result) = object.as_dict_mut() {
result.remove(b"Producer");
// result.remove(b"Producer");
result.remove(b"ModDate");
result.remove(b"Creator");
result.remove(b"ProcSet");
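The calls above (object.as_dict_mut() followed by result.remove(b"...")) are what drop metadata entries while the document loads. A small sketch of the same idea factored into a reusable helper; the helper itself is hypothetical, but it uses only the lopdf Dictionary::remove call already present in this file:

use lopdf::Dictionary;

// Hypothetical helper: remove a list of metadata keys from a PDF dictionary.
// Dictionary::remove simply returns None when a key is absent.
fn strip_keys(dict: &mut Dictionary, keys: &[&[u8]]) {
    for key in keys {
        dict.remove(key);
    }
}
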
21 changes: 21 additions & 0 deletions rust/src/content/tokenize.rs
@@ -0,0 +1,21 @@

use llm_models::tokenizer::LlmTokenizer;
use crate::messages::errors;
use std::path::PathBuf;

pub fn tokenize_content(text: &str) -> Result<Vec<u32>, errors::ContentError> {

    // NOTE: the tokenizer path is currently hardcoded to the local models folder
    let wd_folder: String = "/Users/franciscome/git/iteralabs/molina".to_owned();
    let in_folder: &str = "/models";
    let in_subfolder: &str = "/Meta-Llama-3-8B-Instruct";
    let in_file: &str = "/tokenizer.json";
    let in_path = wd_folder.clone() + in_folder + in_subfolder + in_file;
    let path_buf: PathBuf = PathBuf::from(in_path);

    // Surface a tokenizer-load failure through the crate's error type instead of panicking
    let llama_tokenizer = LlmTokenizer::new_from_tokenizer_json(&path_buf).map_err(|_| {
        errors::ContentError::ContentNotFound(String::from(
            "During Attempt to Load Tokenizer",
        ))
    })?;
    let llama_tokens = llama_tokenizer.tokenize(text);

    Ok(llama_tokens)
}

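A minimal usage sketch for the new function; it assumes the hardcoded models/ path above exists on the local machine and that ContentError derives Debug:

use molina::content::tokenize;

fn main() {
    // The tokenizer file is resolved inside tokenize_content itself
    match tokenize::tokenize_content("This is a sentence") {
        Ok(tokens) => println!("token count: {}", tokens.len()),
        Err(e) => eprintln!("tokenization failed: {:?}", e),
    }
}
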
3 changes: 1 addition & 2 deletions rust/src/data/db.rs
@@ -1,2 +1 @@
// Placeholder

pub mod loader;
30 changes: 30 additions & 0 deletions rust/src/data/loader.rs
@@ -0,0 +1,30 @@
use std::fs;
use std::path::Path;

pub fn load_files(dir: &str) -> Vec<String> {
let mut pdf_files = Vec::new();
let path = Path::new(dir);

// Recursively visit each directory and collect PDF file paths
if path.is_dir() {

// Use fs::read_dir to iterate through entries in the directory
if let Ok(entries) = fs::read_dir(path) {
for entry in entries.filter_map(Result::ok) {
let entry_path = entry.path();

// Check if the entry is a directory or a file
if entry_path.is_dir() {
// Recursion for subdirectories
pdf_files.extend(load_files(entry_path.to_str().unwrap()));
} else if entry_path.extension().map(|s| s == "pdf").unwrap_or(false) {
// If it's a PDF file, add its path to the vector
pdf_files.push(entry_path.to_string_lossy().into_owned());
}
}
}
}

pdf_files
}
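A short usage sketch for the loader, assuming a knowledge/ directory like the one used in the example file (and that all paths are valid UTF-8, since the recursion unwraps to_str):

use molina::data::loader;

fn main() {
    // Recursively collects the path of every .pdf under the directory
    let pdfs = loader::load_files("knowledge");
    for path in &pdfs {
        println!("found: {}", path);
    }
}
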

1 change: 1 addition & 0 deletions rust/src/data/mod.rs
@@ -1 +1,2 @@
// Placeholder
pub mod loader;
27 changes: 19 additions & 8 deletions rust/src/lib.rs
@@ -26,7 +26,6 @@ pub mod inference;
/// Structs and logic for Events, Custom Error Types, Logs
pub mod messages;

// use lopdf::Document;
// ### extract_content
// Extracts the PDF's content as indicated by the params.
#[pyfunction]
@@ -49,18 +48,30 @@ fn extract_content(input_file: &str) -> PyResult<BTreeMap<u32, String>> {
Ok(text)
}

// ### separate_content
// separation into chunks with the selected size
// {sub-character, character, sentence, paragraph, section, page}

// ### tokenize_content
// Take content and tokenize it with a previously downloaded tokenizer
#[pyfunction]
fn split_content() -> PyResult<()> {
// println!("split_content call");
Ok(())
fn tokenize_content(input_text: &str) -> PyResult<Vec<u32>> {

    let tokenized = content::tokenize::tokenize_content(input_text).map_err(|e| {
        match e {
            messages::errors::ContentError::ContentNotFound(msg) => {
                pyo3::exceptions::PyValueError::new_err(msg)
            }
            messages::errors::ContentError::UnsuccessfulExtraction(msg) => {
                pyo3::exceptions::PyIndexError::new_err(msg)
            }
        }
    })?;

    Ok(tokenized)
}

#[pymodule]
fn molina(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(extract_content, m)?)?;
m.add_function(wrap_pyfunction!(split_content, m)?)?;
m.add_function(wrap_pyfunction!(tokenize_content, m)?)?;
Ok(())
}
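One possible follow-up, sketched here as an assumption rather than anything in this commit: the ContentError-to-PyErr match could be factored into a helper inside lib.rs so extract_content and tokenize_content share the same mapping.

use pyo3::exceptions::{PyIndexError, PyValueError};
use pyo3::PyErr;

// Hypothetical helper mirroring the variant mapping used in tokenize_content above
fn to_pyerr(e: messages::errors::ContentError) -> PyErr {
    match e {
        messages::errors::ContentError::ContentNotFound(msg) => PyValueError::new_err(msg),
        messages::errors::ContentError::UnsuccessfulExtraction(msg) => PyIndexError::new_err(msg),
    }
}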
