Changes for v0.1.0 in the Rust version
IFFranciscoME committed Oct 19, 2024
1 parent 52c8bcc commit 66b26b6
Showing 10 changed files with 124 additions and 39 deletions.
27 changes: 17 additions & 10 deletions .gitignore
@@ -1,34 +1,41 @@

# -- Local Files ------------------------------------------------------------ #
# -- ----------- ------------------------------------------------------------ #
# -- Local Files ---------------------------------------------------------------------- #
# -- ----------- ---------------------------------------------------------------------- #

*.pdf
*.gguf
*.pdf
models/
# -- Rust ------------------------------------------------------------------- #
# -- ---- ------------------------------------------------------------------- #

# -- ChromaDB ------------------------------------------------------------------------- #
# -- ---- ----------------------------------------------------------------------------- #

chroma/
chroma_langchain_db/

# -- Rust ----------------------------------------------------------------------------- #
# -- ---- ----------------------------------------------------------------------------- #

/rust/target
/rust/Cargo.lock

# -- Python ----------------------------------------------------------------- #
# -- ------ ----------------------------------------------------------------- #
# -- Python --------------------------------------------------------------------------- #
# -- ------ --------------------------------------------------------------------------- #

*.cpython-311-darwin.so
__pycache__/

# -- Python ----------------------------------------------------------------- #
# -- ------ ----------------------------------------------------------------- #
# -- Python --------------------------------------------------------------------------- #
# -- ------ --------------------------------------------------------------------------- #

.python-version
pyrightconfig.json
files/
*.egg-info


# -- File Structure --------------------------------------------------------- #
# -- -------------- --------------------------------------------------------- #
# -- File Structure ------------------------------------------------------------------- #
# -- -------------- ------------------------------------------------------------------- #

*.whl
build/
2 changes: 1 addition & 1 deletion rust/Cargo.toml
@@ -24,6 +24,6 @@ llm_models = "0.0.1"

[lib]
name = "molina"
crate-type = ["lib"]
crate-type = ["cdylib"]
test = true
doctest = true
27 changes: 17 additions & 10 deletions rust/examples/extract_content.rs
@@ -1,5 +1,5 @@
use molina::content::extract;

use molina::data::loader;
use std::error::Error;
use llm_models::tokenizer::LlmTokenizer;
use std::path::PathBuf;
@@ -9,28 +9,35 @@ fn main() -> Result<(), Box<dyn Error>> {
let wd_folder: String = "/Users/franciscome/git/iteralabs/molina".to_owned();
let in_folder: &str = "/knowledge";
let in_subfolder: &str = "/conference_icml";
let in_file: &str = "/alon22a.pdf";
let in_file: &str = "/mao24c.pdf";
let in_path = wd_folder.clone() + in_folder + in_subfolder + in_file;

// -------------------------------------------------------------- FILES LOADING -- //
// -------------------------------------------------------------- ------------- -- //

//let v_files = loader::load_files(&in_path);
//println!("v_files is: {:?}", &v_files);

// -- ------------------------------------------------------ CONTENT EXTRACTION -- //
// -- ------------------------------------------------------ ------------------ -- //

//let in_file: &str = "/1-s2.0-S0032063323002052-main.pdf";
let r_extraction = extract::extract_text(&in_path);
let raw_document = &r_extraction.unwrap();

println!("\nDoc's page content: \n\n {:?}", raw_document[&1]);
println!("r_extraction has: {:?}", &r_extraction);

let raw_document = &r_extraction?;
println!("\nDoc's page content: \n\n {:?}", raw_document[&4]);

// -- ------------------------------------------------------------ TOKENIZATION -- //
// -- ------------------------------------------------------------ ------------ -- //

let in_tok = wd_folder.clone() + "/models/Meta-Llama-3-8B-Instruct/tokenizer.json";
let path_buf = PathBuf::from(in_tok);

let llama_tokenizer = LlmTokenizer::new_from_tokenizer_json(&path_buf)?;
let llama_tokens = llama_tokenizer.tokenize("This is a sentence");

println!("\nValidation: Token count: {}", llama_tokens.len());
println!("Validation: Downloaded meta/llama3 {:?}\n", llama_tokens);

// let llama_tokens = llama_tokenizer.tokenize("This is a sentence");
//println!("\nValidation: Token count: {}", llama_tokens.len());
//println!("Validation: Downloaded meta/llama3 {:?}\n", llama_tokens);

let tokenized_doc = &raw_document[&2];
let tokens_doc = llama_tokenizer.tokenize(tokenized_doc);
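A side note on the example above: indexing the returned BTreeMap with raw_document[&4] panics when no entry exists for page 4. A minimal sketch of a safer lookup inside the same main, using only the types already shown here:

let r_extraction = extract::extract_text(&in_path);
let raw_document = &r_extraction?;

// BTreeMap::get returns an Option, so a missing page is handled explicitly
match raw_document.get(&4) {
    Some(page_text) => println!("\nDoc's page content: \n\n {:?}", page_text),
    None => eprintln!("no text was extracted for page 4"),
}
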
19 changes: 13 additions & 6 deletions rust/src/content/extract.rs
@@ -14,7 +14,6 @@
//! Concrete Strategies: string, table, image, latex, code
//!
// use crate::content::filter::filter_content;
use crate::content::{filter, process};
use crate::messages::errors;
use lopdf::Document;
@@ -28,7 +27,7 @@ pub fn extract_text<P: AsRef<Path>>(
path: P,
) -> Result<BTreeMap<u32, String>, errors::ContentError> {
// Attempt to Load Document
let r_load = Document::load_filtered(path, filter::filter_content).map_err(|_| {
let mut r_load = Document::load_filtered(path, filter::filter_content).map_err(|_| {
errors::ContentError::ContentNotFound(String::from(
"During Attempt to Load Document",
))
@@ -38,13 +37,21 @@
let mut b_extract = BTreeMap::new();
let size_document = r_load.get_pages().len() as u32;

// Change the document's metadata
//r_load.change_producer("pdfTeX-1.40.20");

for i in 1..=size_document {

let i_text = r_load.extract_text(&vec![i]).map_err(|_| {
errors::ContentError::UnsuccessfulExtraction(String::from(
"During Attempt to Extract Text",
))

// println!("{:?}", r_load.catalog());

let err_message: String = String::from("Error during extraction");

errors::ContentError::UnsuccessfulExtraction(err_message)
});
// pre-process extracted text before inserting into BTreeMap

// to lower case
let r0_text = process::preprocess_text(&i_text?);
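For reference, extract_text returns a Result<BTreeMap<u32, String>, ContentError> keyed by page number, so pages iterate in order. A minimal consumption sketch, assuming only the signature above and that ContentError derives Debug:

use molina::content::extract;

fn main() {
    match extract::extract_text("knowledge/conference_icml/mao24c.pdf") {
        // The BTreeMap keys are 1-based page numbers, iterated in ascending order
        Ok(pages) => {
            for (page, text) in &pages {
                println!("page {}: {} characters", page, text.len());
            }
        }
        Err(e) => eprintln!("extraction failed: {:?}", e),
    }
}
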
6 changes: 4 additions & 2 deletions rust/src/content/filter.rs
@@ -4,7 +4,9 @@
use lopdf::Object;

pub static SIMPLE_FILTER: &[&str] = &["Length"];
pub static SIMPLE_FILTER: &[&str] = &[
"Length",
];

pub static DEFAULT_FILTER: &[&str] = &[
"Length",
@@ -43,7 +45,7 @@ pub fn filter_content(
}

if let Ok(result) = object.as_dict_mut() {
result.remove(b"Producer");
// result.remove(b"Producer");
result.remove(b"ModDate");
result.remove(b"Creator");
result.remove(b"ProcSet");
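The calls above (object.as_dict_mut() followed by result.remove(b"...")) are what drop metadata entries while the document loads. A small sketch of the same idea factored into a reusable helper; the helper itself is hypothetical, but it uses only the lopdf Dictionary::remove call already present in this file:

use lopdf::Dictionary;

// Hypothetical helper: remove a list of metadata keys from a PDF dictionary.
// Dictionary::remove simply returns None when a key is absent.
fn strip_keys(dict: &mut Dictionary, keys: &[&[u8]]) {
    for key in keys {
        dict.remove(key);
    }
}
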
21 changes: 21 additions & 0 deletions rust/src/content/tokenize.rs
@@ -0,0 +1,21 @@

use llm_models::tokenizer::LlmTokenizer;
use crate::messages::errors;
use std::path::PathBuf;

pub fn tokenize_content(text: &str) -> Result<Vec<u32>, errors::ContentError> {

    // NOTE: the tokenizer path is currently hardcoded to the local models folder
    let wd_folder: String = "/Users/franciscome/git/iteralabs/molina".to_owned();
    let in_folder: &str = "/models";
    let in_subfolder: &str = "/Meta-Llama-3-8B-Instruct";
    let in_file: &str = "/tokenizer.json";
    let in_path = wd_folder.clone() + in_folder + in_subfolder + in_file;
    let path_buf: PathBuf = PathBuf::from(in_path);

    // Surface a tokenizer-load failure through the crate's error type instead of panicking
    let llama_tokenizer = LlmTokenizer::new_from_tokenizer_json(&path_buf).map_err(|_| {
        errors::ContentError::ContentNotFound(String::from(
            "During Attempt to Load Tokenizer",
        ))
    })?;
    let llama_tokens = llama_tokenizer.tokenize(text);

    Ok(llama_tokens)
}

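A minimal usage sketch for the new function; it assumes the hardcoded models/ path above exists on the local machine and that ContentError derives Debug:

use molina::content::tokenize;

fn main() {
    // The tokenizer file is resolved inside tokenize_content itself
    match tokenize::tokenize_content("This is a sentence") {
        Ok(tokens) => println!("token count: {}", tokens.len()),
        Err(e) => eprintln!("tokenization failed: {:?}", e),
    }
}
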
3 changes: 1 addition & 2 deletions rust/src/data/db.rs
@@ -1,2 +1 @@
// Placeholder

pub mod loader;
30 changes: 30 additions & 0 deletions rust/src/data/loader.rs
@@ -0,0 +1,30 @@
use std::fs;
use std::path::Path;

pub fn load_files(dir: &str) -> Vec<String> {
let mut pdf_files = Vec::new();
let path = Path::new(dir);

// Recursively visit each directory and collect PDF file paths
if path.is_dir() {

// Use fs::read_dir to iterate through entries in the directory
if let Ok(entries) = fs::read_dir(path) {
for entry in entries.filter_map(Result::ok) {
let entry_path = entry.path();

// Check if the entry is a directory or a file
if entry_path.is_dir() {
// Recursion for subdirectories
pdf_files.extend(load_files(entry_path.to_str().unwrap()));
} else if entry_path.extension().map(|s| s == "pdf").unwrap_or(false) {
// If it's a PDF file, add its path to the vector
pdf_files.push(entry_path.to_string_lossy().into_owned());
}
}
}
}

pdf_files
}
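A short usage sketch for the loader, assuming a knowledge/ directory like the one used in the example file (and that all paths are valid UTF-8, since the recursion unwraps to_str):

use molina::data::loader;

fn main() {
    // Recursively collects the path of every .pdf under the directory
    let pdfs = loader::load_files("knowledge");
    for path in &pdfs {
        println!("found: {}", path);
    }
}
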

1 change: 1 addition & 0 deletions rust/src/data/mod.rs
@@ -1 +1,2 @@
// Placeholder
pub mod loader;
27 changes: 19 additions & 8 deletions rust/src/lib.rs
@@ -26,7 +26,6 @@ pub mod inference;
/// Structs and logic for Events, Custom Error Types, Logs
pub mod messages;

// use lopdf::Document;
// ### extract_content
// Extracts the PDF's content as indicated by the params.
#[pyfunction]
@@ -49,18 +48,30 @@ fn extract_content(input_file: &str) -> PyResult<BTreeMap<u32, String>> {
Ok(text)
}

// ### separate_content
// separation into chunks with the selected size
// {sub-character, character, sentence, paragraph, section, page}

// ### tokenize_content
// Take content and tokenize it with a previously downloaded tokenizer
#[pyfunction]
fn split_content() -> PyResult<()> {
// println!("split_content call");
Ok(())
fn tokenize_content(input_text: &str) -> PyResult<Vec<u32>> {

    let tokenized = content::tokenize::tokenize_content(input_text).map_err(|e| {
        match e {
            messages::errors::ContentError::ContentNotFound(msg) => {
                pyo3::exceptions::PyValueError::new_err(msg)
            }
            messages::errors::ContentError::UnsuccessfulExtraction(msg) => {
                pyo3::exceptions::PyIndexError::new_err(msg)
            }
        }
    })?;

    Ok(tokenized)
}

#[pymodule]
fn molina(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(extract_content, m)?)?;
m.add_function(wrap_pyfunction!(split_content, m)?)?;
m.add_function(wrap_pyfunction!(tokenize_content, m)?)?;
Ok(())
}
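One possible follow-up, sketched here as an assumption rather than anything in this commit: the ContentError-to-PyErr match could be factored into a helper inside lib.rs so extract_content and tokenize_content share the same mapping.

use pyo3::exceptions::{PyIndexError, PyValueError};
use pyo3::PyErr;

// Hypothetical helper mirroring the variant mapping used in tokenize_content above
fn to_pyerr(e: messages::errors::ContentError) -> PyErr {
    match e {
        messages::errors::ContentError::ContentNotFound(msg) => PyValueError::new_err(msg),
        messages::errors::ContentError::UnsuccessfulExtraction(msg) => PyIndexError::new_err(msg),
    }
}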
