Commit
Merge pull request #21 from HMUNACHI/dev
Tokenizer
Showing 11 changed files with 235 additions and 30 deletions.
@@ -0,0 +1,107 @@
import os
from typing import List, Optional

from sentencepiece import SentencePieceProcessor, SentencePieceTrainer


class Tokenizer:
    """
    A tokenizer class that uses SentencePiece to encode and decode text.

    This class can be initialized either with an existing SentencePiece model
    or with a dataset to train a new model. It provides methods to encode a
    string into a list of token ids and to decode a list of token ids back
    into a string.

    Attributes:
        sp_model (SentencePieceProcessor): The SentencePiece processor.
        n_words (int): Number of words in the vocabulary.
        bos_id (int): Token id for the beginning of a sentence.
        eos_id (int): Token id for the end of a sentence.
        pad_id (int): Token id for padding.

    Example usage:

        Training a new model and encoding/decoding a string:

        ```python
        # Initialize tokenizer with training data and train a new model.
        text_paths = ['/Users/mac1/Desktop/nanodl/nanodl/__src/utils/sample.txt']
        tokenizer = Tokenizer(training_data=text_paths,
                              vocab_size=100,
                              model_type='bpe',
                              max_sentence_length=50)

        # Encode a sentence.
        encoded_sentence = tokenizer.encode('Hello, world!')
        print(f'Encoded: {encoded_sentence}')

        # Decode the encoded sentence.
        decoded_sentence = tokenizer.decode(encoded_sentence)
        print(f'Decoded: {decoded_sentence}')
        ```

        Loading an existing model and encoding/decoding a string:

        ```python
        # Initialize tokenizer with a pre-trained model.
        tokenizer = Tokenizer(model_path='path/to/model.model')

        # Encode a sentence.
        encoded_sentence = tokenizer.encode('Hello, world!')
        print(f'Encoded: {encoded_sentence}')

        # Decode the encoded sentence.
        decoded_sentence = tokenizer.decode(encoded_sentence)
        print(f'Decoded: {decoded_sentence}')
        ```
    """

    def __init__(self,
                 training_data: Optional[List[str]] = None,
                 vocab_size: Optional[int] = None,
                 model_type: str = "bpe",
                 max_sentence_length: int = 512,
                 model_path: Optional[str] = None):

        if model_path and os.path.isfile(model_path):
            # Load an existing model.
            self.sp_model = SentencePieceProcessor(model_file=model_path)
        elif training_data and all(os.path.isfile(f) for f in training_data):
            # Train a new model from a list of data files.
            input_files = ','.join(training_data)
            model_prefix = "trained_model"
            SentencePieceTrainer.train(
                input=input_files,
                model_prefix=model_prefix,
                vocab_size=vocab_size,
                model_type=model_type,
                max_sentence_length=max_sentence_length,
            )
            self.sp_model = SentencePieceProcessor(model_file=f"{model_prefix}.model")
        else:
            raise ValueError("Must provide either a model_path or a non-empty training_data list")

        # Initialize token ids.
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()

        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self,
               s: str,
               bos: bool = True,
               eos: bool = False) -> List[int]:
        """Converts a string into a list of token ids."""
        assert isinstance(s, str)
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self,
               t: List[int]) -> str:
        """Converts a list of token ids back into a string."""
        return self.sp_model.decode(t)
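For reference, a minimal end-to-end run of the new class against the sample file added later in this commit might look like the sketch below. It is not part of the diff; the repository-relative path to sample.txt and the vocab size of 100 are assumptions mirroring the docstring example.

```python
# Hypothetical usage sketch, not part of the commit: train on the bundled
# sample file, then round-trip a sentence through encode/decode.
text_paths = ["nanodl/__src/utils/sample.txt"]  # assumed repository-relative path
tokenizer = Tokenizer(training_data=text_paths,
                      vocab_size=100,
                      model_type="bpe",
                      max_sentence_length=50)

ids = tokenizer.encode("Hello, world!")  # BOS id is prepended by default
print(ids)                               # list of ints; exact values depend on training
print(tokenizer.decode(ids))             # should recover 'Hello, world!'
```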
@@ -2,4 +2,5 @@ jax
jaxlib
flax
optax
einops
einops
sentencepiece
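Since `sentencepiece` is the only new dependency, a quick import check (a hypothetical snippet, not part of the commit) is enough to confirm the environment picks it up:

```python
# Hypothetical check that the newly added dependency resolves and exposes the
# two classes the Tokenizer relies on.
import sentencepiece as spm

print(spm.__version__)                         # no version is pinned in requirements.txt
print(hasattr(spm, "SentencePieceTrainer"))    # expected: True
print(hasattr(spm, "SentencePieceProcessor"))  # expected: True
```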
@@ -0,0 +1,3 @@
Hello, world! This is a test of the Tokenizer.
Let's see how it tokenizes this file.
Another sentence to check the tokenization process.
Binary file not shown.
@@ -0,0 +1,100 @@
<unk> 0
<s> 0
</s> 0
▁t -0
is -1
en -2
es -3
iz -4
ok -5
▁T -6
his -7
eniz -8
okeniz -9
He -10
Le -11
ee -12
er -13
fi -14
he -15
ho -16
it -17
ld -18
le -19
ll -20
of -21
or -22
▁a -23
▁s -24
▁w -25
Let -26
est -27
how -28
llo -29
▁He -30
▁fi -31
▁is -32
▁it -33
▁of -34
orld -35
▁Let -36
▁how -37
▁see -38
▁the -39
▁This -40
▁file -41
▁test -42
▁this -43
▁Hello -44
▁world -45
okenizer -46
okenizes -47
▁Tokenizer -48
▁tokenizes -49
Th -50
To -51
el -52
et -53
hi -54
il -55
ke -56
lo -57
ni -58
ow -59
rl -60
se -61
st -62
te -63
th -64
to -65
wo -66
ze -67
▁H -68
▁L -69
▁f -70
▁h -71
▁i -72
▁o -73
▁ -74
e -75
i -76
s -77
t -78
o -79
h -80
l -81
. -82
T -83
f -84
k -85
n -86
r -87
w -88
z -89
! -90
' -91
, -92
H -93
L -94
a -95
d -96
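The listing above is the human-readable `.vocab` side of the trained model; the same pieces and scores can be read back from the binary model artifact with a short sketch like the one below (the `trained_model.model` filename is an assumption based on the model prefix used in the code; the snippet is not part of the commit).

```python
# Hypothetical inspection sketch, not part of the commit: load the trained
# SentencePiece model and print each piece with its id and score, which
# should mirror the .vocab listing above.
from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor(model_file="trained_model.model")  # assumed path
for piece_id in range(sp.get_piece_size()):
    print(piece_id, sp.id_to_piece(piece_id), sp.get_score(piece_id))
```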