Rename pretokenizers -> pre_tokenizers
Align with the naming convention in the tokenizers crate.
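For downstream code, the rename touches both the module path (rten_text::pretokenizers becomes rten_text::pre_tokenizers) and the trait method (pretokenize becomes pre_tokenize). A minimal sketch of usage under the new names, based on the types and signatures visible in the diff below; it assumes the gpt2() constructor used in Tokenizer::from_json is public, and the input string is purely illustrative:

use rten_text::pre_tokenizers::{ByteLevelPreTokenizer, PreTokenizer};

fn main() {
    // GPT-2 style byte-level pre-tokenizer, constructed the same way as in
    // Tokenizer::from_json in tokenizers.rs below.
    let pre_tokenizer = ByteLevelPreTokenizer::gpt2();

    // The trait method is now `pre_tokenize` (formerly `pretokenize`); it splits
    // the text into chunks that the tokenizer model then encodes individually.
    if let Ok(chunks) = pre_tokenizer.pre_tokenize("Hello, world!") {
        println!("{:?}", chunks);
    }
}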
robertknight committed Dec 4, 2024
1 parent 61835a2 commit d49d6b2
Showing 7 changed files with 25 additions and 24 deletions.
2 changes: 1 addition & 1 deletion rten-generate/src/text_decoder.rs
@@ -73,7 +73,7 @@ mod tests {
     use std::collections::HashMap;

     use rten_text::models::{Bpe, WordPiece};
-    use rten_text::pretokenizers::ByteLevelPreTokenizer;
+    use rten_text::pre_tokenizers::ByteLevelPreTokenizer;
     use rten_text::tokenizers::{TokenId, Tokenizer};

     use crate::{GeneratorError, GeneratorUtils};
2 changes: 1 addition & 1 deletion rten-text/src/lib.rs
@@ -10,7 +10,7 @@
 pub mod models;
 pub mod normalizer;
-pub mod pretokenizers;
+pub mod pre_tokenizers;
 pub mod tokenizers;

 mod split;
2 changes: 1 addition & 1 deletion rten-text/src/models/bpe.rs
@@ -547,7 +547,7 @@ mod tests {
     use std::collections::HashMap;

     use super::{merge_pairs_from_lines, Bpe, EncodedBytes};
-    use crate::pretokenizers::ByteLevelPreTokenizer;
+    use crate::pre_tokenizers::ByteLevelPreTokenizer;
     use crate::tokenizers::{TokenId, Tokenizer};

     // The first ~25 lines of the merge list from GPT 2.
2 changes: 1 addition & 1 deletion rten-text/src/models/wordpiece.rs
@@ -136,7 +136,7 @@ mod tests {

     use crate::models::{WordPiece, WordPieceOptions};
     use crate::normalizer::{BertNormalizer, BertNormalizerOptions, Normalizer};
-    use crate::pretokenizers::BertPreTokenizer;
+    use crate::pre_tokenizers::BertPreTokenizer;
     use crate::tokenizers::{Tokenizer, TokenizerOptions};

     fn create_tokenizer(
6 changes: 3 additions & 3 deletions rten-text/src/{pretokenizers.rs → pre_tokenizers.rs}
@@ -26,7 +26,7 @@ impl fmt::Display for PreTokenizeError {
 /// tokenized by a [`Model`](crate::tokenizers::Model) individually.
 pub trait PreTokenizer {
     /// Split `text` into chunks and return a vector of sub-slices.
-    fn pretokenize<'a>(&self, text: &'a str) -> Result<Vec<&'a str>, PreTokenizeError>;
+    fn pre_tokenize<'a>(&self, text: &'a str) -> Result<Vec<&'a str>, PreTokenizeError>;
 }

 /// Tokenization regex used by GPT-2.
@@ -59,7 +59,7 @@ impl ByteLevelPreTokenizer {
 }

 impl PreTokenizer for ByteLevelPreTokenizer {
-    fn pretokenize<'a>(&self, text: &'a str) -> Result<Vec<&'a str>, PreTokenizeError> {
+    fn pre_tokenize<'a>(&self, text: &'a str) -> Result<Vec<&'a str>, PreTokenizeError> {
         self.splitter
             .find_iter(text)
             .filter_map(|piece| match piece {
@@ -91,7 +91,7 @@ impl Default for BertPreTokenizer {
 }

 impl PreTokenizer for BertPreTokenizer {
-    fn pretokenize<'a>(&self, text: &'a str) -> Result<Vec<&'a str>, PreTokenizeError> {
+    fn pre_tokenize<'a>(&self, text: &'a str) -> Result<Vec<&'a str>, PreTokenizeError> {
         let is_punc_or_space =
             |ch: char| ch.is_ascii_punctuation() || ch.is_punctuation() || ch.is_whitespace();
         let words = text.split_keep_delimeters(is_punc_or_space).collect();
33 changes: 17 additions & 16 deletions rten-text/src/tokenizers.rs
@@ -18,7 +18,7 @@ use std::ops::Range;

 use crate::models::{merge_pairs_from_lines, Bpe, BpeError, WordPiece};
 use crate::normalizer::{BertNormalizer, BertNormalizerOptions, Normalizer};
-use crate::pretokenizers::{
+use crate::pre_tokenizers::{
     BertPreTokenizer, ByteLevelPreTokenizer, PreTokenizeError, PreTokenizer,
 };
 use crate::split::SliceExt;
@@ -274,7 +274,7 @@ pub struct TokenizerOptions<'a> {
 /// into overlapping chunks and truncating long sequences.
 pub struct Tokenizer {
     normalizer: Option<Box<dyn Normalizer>>,
-    pretokenizer: Option<Box<dyn PreTokenizer>>,
+    pre_tokenizer: Option<Box<dyn PreTokenizer>>,
     model: Box<dyn Model>,

     /// Token added at start of output.
@@ -289,7 +289,7 @@ impl Tokenizer {
     pub fn new<M: Model + 'static>(model: M, options: TokenizerOptions) -> Tokenizer {
         Tokenizer {
             model: Box::new(model),
-            pretokenizer: None,
+            pre_tokenizer: None,
             normalizer: None,
             cls_token: options.cls_token.map(|t| t.to_string()),
             sep_token: options.sep_token.map(|t| t.to_string()),
@@ -304,7 +304,7 @@ impl Tokenizer {

     /// Configure the pre-tokenizer used by this tokenizer.
     pub fn with_pre_tokenizer(mut self, pre_tokenizer: Box<dyn PreTokenizer>) -> Self {
-        self.pretokenizer = Some(pre_tokenizer);
+        self.pre_tokenizer = Some(pre_tokenizer);
         self
     }

@@ -334,13 +334,14 @@ impl Tokenizer {
             normalizer
         });

-        let pretokenizer: Option<Box<dyn PreTokenizer>> = json.pre_tokenizer.map(|pretokenizer| {
-            let pretokenizer: Box<dyn PreTokenizer> = match pretokenizer {
-                json::PreTokenizer::Bert => Box::new(BertPreTokenizer::new()),
-                json::PreTokenizer::ByteLevel => Box::new(ByteLevelPreTokenizer::gpt2()),
-            };
-            pretokenizer
-        });
+        let pre_tokenizer: Option<Box<dyn PreTokenizer>> =
+            json.pre_tokenizer.map(|pre_tokenizer| {
+                let pre_tokenizer: Box<dyn PreTokenizer> = match pre_tokenizer {
+                    json::PreTokenizer::Bert => Box::new(BertPreTokenizer::new()),
+                    json::PreTokenizer::ByteLevel => Box::new(ByteLevelPreTokenizer::gpt2()),
+                };
+                pre_tokenizer
+            });

         let mut tokenizer = match json.model {
             json::Model::Bpe(model) => {
@@ -397,8 +398,8 @@ impl Tokenizer {
             tokenizer = tokenizer.with_normalizer(normalizer);
         }

-        if let Some(pretokenizer) = pretokenizer {
-            tokenizer = tokenizer.with_pre_tokenizer(pretokenizer);
+        if let Some(pre_tokenizer) = pre_tokenizer {
+            tokenizer = tokenizer.with_pre_tokenizer(pre_tokenizer);
         }

         Ok(tokenizer)
@@ -495,9 +496,9 @@ impl Tokenizer {
         };

         let chunks = self
-            .pretokenizer
+            .pre_tokenizer
             .as_ref()
-            .map(|pt| pt.pretokenize(&normalized))
+            .map(|pt| pt.pre_tokenize(&normalized))
             .transpose()
             .map_err(TokenizerError::PreTokenizeError)?
             .unwrap_or(Vec::from([normalized.as_str()]));
@@ -758,7 +759,7 @@ mod tests {

     use super::{EncodeOptions, EncoderInput, TokenId, Tokenizer, TokenizerOptions, WordPiece};
     use crate::normalizer::{BertNormalizer, BertNormalizerOptions, Normalizer};
-    use crate::pretokenizers::BertPreTokenizer;
+    use crate::pre_tokenizers::BertPreTokenizer;
     use serde::Deserialize;

     fn make_wordpiece(vocab: &[&str]) -> WordPiece {
2 changes: 1 addition & 1 deletion rten-text/tests/reftest.rs
@@ -6,7 +6,7 @@ use std::path::PathBuf;

 use rten_text::models::{merge_pairs_from_lines, Bpe, WordPiece};
 use rten_text::normalizer::{BertNormalizer, BertNormalizerOptions};
-use rten_text::pretokenizers::{BertPreTokenizer, ByteLevelPreTokenizer};
+use rten_text::pre_tokenizers::{BertPreTokenizer, ByteLevelPreTokenizer};
 use rten_text::tokenizers::{TokenId, Tokenizer, TokenizerOptions};
 use serde::Deserialize;

