From loading, extraction and tokenization progress
IFFranciscoME committed Oct 17, 2024
1 parent f97cac0 commit 5bc09f3
Showing 5 changed files with 177 additions and 27 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -9,7 +9,8 @@ exclude = [".git/**", ".github/**"]

# [project]
# name = "molina"
# dependencies = ["toml", "transformers", "SentecePiece", "protobuf", "torch"]
# dependencies = ["toml", "transformers",
# "accelerate","bitsandbytes", "SentecePiece", "protobuf", "torch"]
# requires-python = ">=3.10"
# keywords = ["llm", "rust"]
# classifier = [
141 changes: 141 additions & 0 deletions python/examples/llama_test.py
@@ -0,0 +1,141 @@

from molina import extract_content
import torch
from torch import cuda
import transformers
from transformers import AutoTokenizer
from time import time
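# Authenticate with the Hugging Face Hub so the gated Llama checkpoints can be downloaded.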
from huggingface_hub import login
login(token="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")

def main():

    """
    !pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 \
        langchain==0.0.300 xformers==0.0.21 bitsandbytes==0.41.1 \
        sentence_transformers==2.2.2 chromadb==0.4.12
    """

    wd_folder = "/Users/franciscome/git/iteralabs/molina"
    in_folder = "/knowledge"
    in_subfolder = "/conference_icml"
    in_file = "/basu24a.pdf"
    in_pdf = wd_folder + in_folder + in_subfolder + in_file
    # in_tokenizer = wd_folder + "/models/Meta-Llama-3-8B-Instruct/"

    # -- --------------------------------------------------------------------------- -- #
    # -- --------------------------------------------------------------------------- -- #

    result_content = extract_content(input_file=in_pdf)

    # print(f"the resulting keys were: {result_content.keys()}")
    # print(result_content[1])
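    # result_content is expected to map page numbers to the text extracted from each page
    # (the commented prints above inspect its keys and one page).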

    # -- --------------------------------------------------------------------------- -- #
    # -- --------------------------------------------------------------------------- -- #

    model_id = 'meta-llama/Llama-3.2-3B-Instruct'
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'mps'
    print(device)
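    # Prefer the current CUDA device when available; otherwise fall back to Apple's MPS backend.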

    time_start = time()
    model_config = transformers.AutoConfig.from_pretrained(
        model_id,
        trust_remote_code=True,
        max_new_tokens=1024
    )

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=None,
        device_map='auto',
    )
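    # quantization_config=None keeps the full-precision weights; a BitsAndBytesConfig could be
    # passed here instead to load the model in 8-bit or 4-bit (assumes bitsandbytes is installed).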

    time_end = time()

    print(f"Prepare model, tokenizer: {round(time_end-time_start, 3)} sec.")

    # -- --------------------------------------------------------------------------- -- #
    # -- --------------------------------------------------------------------------- -- #

    time_start = time()
    query_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        max_length=1024,
        device_map="auto",
    )
    time_end = time()

    print(f"Prepare pipeline: {round(time_end-time_start, 3)} sec.")

    # -- --------------------------------------------------------------------------- -- #
    # -- --------------------------------------------------------------------------- -- #

    def test_model(tokenizer, pipeline, message):
        """
        Run a single prompt through the pipeline and format the result.
        Args:
            tokenizer: the tokenizer
            pipeline: the text-generation pipeline
            message: the prompt
        Returns:
            A string with the question, the generated answer, and the total time.
        """

        time_start = time()
        sequences = pipeline(
            message,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=200,
        )
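        # do_sample=True with top_k=10 samples each new token from the 10 most likely
        # candidates; max_length=200 caps prompt plus generated tokens.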

        time_end = time()
        total_time = f"{round(time_end-time_start, 3)} sec."

        question = sequences[0]['generated_text'][:len(message)]
        answer = sequences[0]['generated_text'][len(message):]
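        # The pipeline echoes the prompt, so the output is split back into question and answer.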

        return f"Question: {question}\nAnswer: {answer}\nTotal time: {total_time}"

    # -- --------------------------------------------------------------------------- -- #
    # -- --------------------------------------------------------------------------- -- #

    """
    Terminal chat session to interact with the model; type 'exit' to end the session.
    """
    print("Welcome to the chat! Type 'exit' to end the session.")

    while True:
        # Gather input from the user
        in_query = input("You: ")

        # Exit condition
        if in_query.lower() == 'exit':
            print("Ending chat session.")
            break

        # Get response from the model
        response = test_model(tokenizer, query_pipeline, in_query)

        # Print the response
        print(f"Model: {response}")

if __name__ == "__main__":
    main()

14 changes: 12 additions & 2 deletions python/molina/main.py
@@ -1,22 +1,32 @@
from molina import extract_content
import torch
import transformers

def main():

    """
    !pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 \
        langchain==0.0.300 xformers==0.0.21 bitsandbytes==0.41.1 \
        sentence_transformers==2.2.2 chromadb==0.4.12
    """

    wd_folder = "/Users/franciscome/git/iteralabs/molina"
    in_folder = "/knowledge"
    in_subfolder = "/conference_icml"
    in_file = "/basu24a.pdf"
    in_pdf = wd_folder + in_folder + in_subfolder + in_file
    # in_tokenizer = wd_folder + "/models/Meta-Llama-3-8B-Instruct/"

    # -- --------------------------------------------------------------------------- -- #
    # -- --------------------------------------------------------------------------- -- #

    result_content = extract_content(input_file=in_pdf)

    print(f"the resulting keys were: {result_content.keys()}")
    print(result_content[1])

    # -- --------------------------------------------------------------------------- -- #
    # -- --------------------------------------------------------------------------- -- #
    # https://huggingface.co/docs/.../auto#transformers.AutoTokenizer.from_pretrained


if __name__ == "__main__":
    main()
4 changes: 2 additions & 2 deletions rust/Cargo.toml
@@ -20,10 +20,10 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
thiserror = { version = "1.0.64" }
tokenizers = { version = "0.20.1", features = ["http"]}

llm_models = "0.0.1"

[lib]
name = "molina"
crate-type = ["cdylib"]
crate-type = ["lib"]
test = true
doctest = true
42 changes: 20 additions & 22 deletions rust/examples/extract_content.rs
@@ -1,13 +1,16 @@
use molina::content::extract;
// use tokenizers::tokenizer::{Tokenizer, Result};
// use tokenizers::models::bpe::BPE;

fn main() {
use std::error::Error;
use llm_models::tokenizer::LlmTokenizer;
use std::path::PathBuf;

fn main() -> Result<(), Box<dyn Error>> {

    let wd_folder: String = "/Users/franciscome/git/iteralabs/molina".to_owned();
    let in_folder: &str = "/knowledge";
    let in_subfolder: &str = "/conference_icml";
    let in_file: &str = "/alon22a.pdf";
    let in_path = wd_folder + in_folder + in_subfolder + in_file;
    let in_path = wd_folder.clone() + in_folder + in_subfolder + in_file;

    // -- ------------------------------------------------------ CONTENT EXTRACTION -- //
    // -- ------------------------------------------------------ ------------------ -- //
@@ -19,27 +22,22 @@ fn main() {

    // -- ------------------------------------------------------------ TOKENIZATION -- //
    // -- ------------------------------------------------------------ ------------ -- //

    let in_tok = wd_folder.clone() + "/models/Meta-Llama-3-8B-Instruct/tokenizer.json";
    let path_buf = PathBuf::from(in_tok);
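    // Assumes the Meta-Llama-3-8B-Instruct tokenizer.json has already been downloaded into
    // the local models/ folder; the path above simply points at that file.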

    // let tokenized_doc = &raw_document[&2];
    // let tokens = tokenizer.encode(tokenized_doc, false)?;

    // println!("Token Count: {}", tokens.len());
    // println!("Tokens: {:?}", tokens);

    // Print the raw contents, structured as indicated in the docs
    // https://docs.rs/lopdf/latest/lopdf/struct.Document.html
    // let obj_document: Vec<_> = raw_document.objects.clone().into_values().collect();
    // println!("The objects that compose the PDF: {:?}", obj_document);
    let llama_tokenizer = LlmTokenizer::new_from_tokenizer_json(&path_buf)?;
    let llama_tokens = llama_tokenizer.tokenize("This is a sentence");
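    // Sanity check: tokenize a short fixed sentence with the freshly loaded tokenizer.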

    // Get all the keys
    // println!("No. of Keys is: {:?}", &first_text.text.keys().len());
    println!("\nValidation: Token count: {}", llama_tokens.len());
    println!("Validation: Downloaded meta/llama3 {:?}\n", llama_tokens);

    // Return the first K, V pair
    // println!("First K,V \n{:?}", &first_text.text.first_key_value());
    let tokenized_doc = &raw_document[&2];
    let tokens_doc = llama_tokenizer.tokenize(tokenized_doc);
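    // Tokenize the content stored under key 2 of the extracted document; the resulting
    // counts and tokens are printed below.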

    // Return the last K, V pair
    // println!("Last K,V \n{:?}", &first_text.text.last_key_value());
    println!("\nContent Len: {}", tokenized_doc.len());
    println!("Tokenized Content Len: {}", tokens_doc.len());
    println!("Actual Tokens: {:?}\n", tokens_doc);

    // for a given key, get the content
    // println!("Content {:?}", &first_text.text.get(&26));
    Ok(())
}
