Pre-tokenizers that support multi-word/non-whitespace BPE in a single pass #1753

Open · wants to merge 3 commits into main
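
For reviewers, a minimal sketch of the intended training-time usage, condensed from the two example scripts added in this PR (constructor names and parameters are taken from the diff below; the corpus path is a placeholder):

from tokenizers import Tokenizer, trainers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import RandomChunkSplit, RandomWhitespaceSplit

# Chunk variant: cut the text into random 2-5 character chunks, ignoring
# whitespace, so BPE can learn merges that cross word boundaries.
chunk_tok = Tokenizer(BPE())
chunk_tok.pre_tokenizer = RandomChunkSplit(min_length=2, max_length=5)

# Whitespace variant: split at each whitespace position with probability 0.3,
# leaving the remaining word boundaries joined.
ws_tok = Tokenizer(BPE())
ws_tok.pre_tokenizer = RandomWhitespaceSplit(split_probability=0.3)

trainer = trainers.BpeTrainer(vocab_size=10000, min_frequency=2)
chunk_tok.train(["corpus.txt"], trainer)  # "corpus.txt" is a placeholder path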
93 changes: 93 additions & 0 deletions bindings/python/examples/train_random_chunk_bpe.py
@@ -0,0 +1,93 @@
import argparse
import glob
import json
import os
from os.path import join

from tokenizers import Tokenizer, normalizers, trainers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import RandomChunkSplit


parser = argparse.ArgumentParser()
parser.add_argument(
"--files",
default=None,
metavar="path",
type=str,
required=True,
help="The files to use as training; accept '**/*.txt' type of patterns \
if enclosed in quotes",
)
parser.add_argument(
"--out",
default="./",
type=str,
help="Path to the output directory, where the files will be saved",
)
parser.add_argument("--name", default="random-chunk-bpe", type=str, help="The name of the output vocab files")
parser.add_argument("--min-length", default=2, type=int, help="Minimum length of chunks")
parser.add_argument("--max-length", default=5, type=int, help="Maximum length of chunks")
parser.add_argument("--vocab-size", default=10000, type=int, help="Size of vocabulary")
parser.add_argument("--min-frequency", default=2, type=int, help="Minimum frequency for a token to be included")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"No files found matching pattern: {args.files}")
    exit(1)


# Initialize a tokenizer with BPE model
tokenizer = Tokenizer(BPE())

# Use RandomChunkSplit as pre-tokenizer
tokenizer.pre_tokenizer = RandomChunkSplit(min_length=args.min_length, max_length=args.max_length)

# Optional: Add NFKC normalization like SentencePieceBPE
tokenizer.normalizer = normalizers.NFKC()

# Configure the BPE trainer
trainer = trainers.BpeTrainer(
vocab_size=args.vocab_size,
min_frequency=args.min_frequency,
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
show_progress=True
)

# Train the model
print(f"Training BPE with RandomChunkSplit (min_length={args.min_length}, max_length={args.max_length})")
tokenizer.train(files, trainer)

# Save the trained tokenizer
output_path = join(args.out, f"{args.name}.json")
tokenizer.save(output_path)
print(f"Trained tokenizer saved to: {output_path}")

# Create an inference version without pre-tokenizer
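# (Rationale: the random pre-tokenizer is only needed to generate training
# chunks. With the pre_tokenizer field removed, the whole normalized input is
# handed to the BPE model directly, so inference is deterministic while the
# learned multi-word merges can still apply.)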
# First save to a temporary file
temp_tokenizer_path = join(args.out, "temp_tokenizer.json")
tokenizer.save(temp_tokenizer_path)

# Read the JSON
with open(temp_tokenizer_path, "r") as f:
    tokenizer_data = json.load(f)

# Remove pre-tokenizer field if present
if "pre_tokenizer" in tokenizer_data:
    del tokenizer_data["pre_tokenizer"]

# Write modified tokenizer to inference file
inference_path = join(args.out, f"{args.name}_inference.json")
with open(inference_path, "w") as f:
    json.dump(tokenizer_data, f, indent=2)

# Clean up temp file
os.remove(temp_tokenizer_path)

print(f"Inference-ready tokenizer (no pre-tokenizer) saved to: {inference_path}")

# Test encoding with inference tokenizer
tokenizer = Tokenizer.from_file(inference_path)
example = "Training BPE with multi-word tokens is very easy"
print(f"\nTest encoding: {tokenizer.encode(example).tokens}")
93 changes: 93 additions & 0 deletions bindings/python/examples/train_random_whitespace_bpe.py
@@ -0,0 +1,93 @@
import argparse
import glob
import json
import os
from os.path import join

from tokenizers import Tokenizer, normalizers, trainers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import RandomWhitespaceSplit


parser = argparse.ArgumentParser()
parser.add_argument(
"--files",
default=None,
metavar="path",
type=str,
required=True,
help="The files to use as training; accept '**/*.txt' type of patterns \
if enclosed in quotes",
)
parser.add_argument(
"--out",
default="./",
type=str,
help="Path to the output directory, where the files will be saved",
)
parser.add_argument("--name", default="random-whitespace-bpe", type=str, help="The name of the output vocab files")
parser.add_argument("--split-prob", default=0.3, type=float, help="Probability of splitting at whitespace (0.0-1.0)")
parser.add_argument("--vocab-size", default=10000, type=int, help="Size of vocabulary")
parser.add_argument("--min-frequency", default=2, type=int, help="Minimum frequency for a token to be included")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"No files found matching pattern: {args.files}")
    exit(1)


# Initialize a tokenizer with BPE model
tokenizer = Tokenizer(BPE())

# Use RandomWhitespaceSplit as pre-tokenizer
tokenizer.pre_tokenizer = RandomWhitespaceSplit(split_probability=args.split_prob)

# Optional: Add NFKC normalization like SentencePieceBPE
tokenizer.normalizer = normalizers.NFKC()

# Configure the BPE trainer
trainer = trainers.BpeTrainer(
vocab_size=args.vocab_size,
min_frequency=args.min_frequency,
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
show_progress=True
)

# Train the model
print(f"Training BPE with RandomWhitespaceSplit (split_probability={args.split_prob})")
tokenizer.train(files, trainer)

# Save the trained tokenizer
output_path = join(args.out, f"{args.name}.json")
tokenizer.save(output_path)
print(f"Trained tokenizer saved to: {output_path}")

# Create an inference version without pre-tokenizer
# First save to a temporary file
temp_tokenizer_path = join(args.out, "temp_tokenizer.json")
tokenizer.save(temp_tokenizer_path)

# Read the JSON
with open(temp_tokenizer_path, "r") as f:
    tokenizer_data = json.load(f)

# Remove pre-tokenizer field if present
if "pre_tokenizer" in tokenizer_data:
    del tokenizer_data["pre_tokenizer"]

# Write modified tokenizer to inference file
inference_path = join(args.out, f"{args.name}_inference.json")
with open(inference_path, "w") as f:
    json.dump(tokenizer_data, f, indent=2)

# Clean up temp file
os.remove(temp_tokenizer_path)

print(f"Inference-ready tokenizer (no pre-tokenizer) saved to: {inference_path}")

# Test encoding with inference tokenizer
tokenizer = Tokenizer.from_file(inference_path)
example = "Training BPE with multi-word tokens is very easy"
print(f"\nTest encoding: {tokenizer.encode(example).tokens}")

2 changes: 2 additions & 0 deletions bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
@@ -8,6 +8,8 @@
Digits = pre_tokenizers.Digits
Metaspace = pre_tokenizers.Metaspace
Punctuation = pre_tokenizers.Punctuation
RandomChunkSplit = pre_tokenizers.RandomChunkSplit
RandomWhitespaceSplit = pre_tokenizers.RandomWhitespaceSplit
Sequence = pre_tokenizers.Sequence
Split = pre_tokenizers.Split
UnicodeScripts = pre_tokenizers.UnicodeScripts
55 changes: 54 additions & 1 deletion bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -367,6 +367,59 @@ class Punctuation(PreTokenizer):
"""
pass

class RandomChunkSplit(PreTokenizer):
"""
RandomChunkSplit PreTokenizer

This pre-tokenizer splits text into random-length chunks regardless of whitespace
boundaries. It's useful for enabling BPE to learn tokens that span across whitespace.

Args:
min_length (:obj:`int`, `optional`, defaults to :obj:`1`):
The minimum length (in characters) for each chunk.
max_length (:obj:`int`, `optional`, defaults to :obj:`5`):
The maximum length (in characters) for each chunk.
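Example (illustrative; chunk boundaries are drawn at random on each call, so
the output shown is just one possible split)::

    pre_tok = RandomChunkSplit(min_length=2, max_length=5)
    pre_tok.pre_tokenize_str("hello world")
    # e.g. [("hel", (0, 3)), ("lo wo", (3, 8)), ("rld", (8, 11))]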
"""
def __init__(self, min_length=1, max_length=5):
pass

def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place

This method allows you to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`

Args:
pretok (:class:`~tokenizers.PreTokenizedString`):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass

def pre_tokenize_str(self, sequence):
"""
Pre-tokenize the given string

This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

Args:
sequence (:obj:`str`):
A string to pre-tokenize

Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuples with the pre-tokenized parts and their offsets
"""
pass

class Sequence(PreTokenizer):
"""
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
@@ -607,4 +660,4 @@ class WhitespaceSplit(PreTokenizer):
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
pass