Pre-tokenizers that support multi-word/non-whitespace BPE in a single pass #1753

Open · wants to merge 3 commits into main
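
For reviewers, a minimal sketch of the intended training-time usage, condensed from the two example scripts added in this PR (constructor names and parameters are taken from the diff below; the corpus path is a placeholder):

from tokenizers import Tokenizer, trainers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import RandomChunkSplit, RandomWhitespaceSplit

# Chunk variant: cut the text into random 2-5 character chunks, ignoring
# whitespace, so BPE can learn merges that cross word boundaries.
chunk_tok = Tokenizer(BPE())
chunk_tok.pre_tokenizer = RandomChunkSplit(min_length=2, max_length=5)

# Whitespace variant: split at each whitespace position with probability 0.3,
# leaving the remaining word boundaries joined.
ws_tok = Tokenizer(BPE())
ws_tok.pre_tokenizer = RandomWhitespaceSplit(split_probability=0.3)

trainer = trainers.BpeTrainer(vocab_size=10000, min_frequency=2)
chunk_tok.train(["corpus.txt"], trainer)  # "corpus.txt" is a placeholder path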
93 changes: 93 additions & 0 deletions bindings/python/examples/train_random_chunk_bpe.py
@@ -0,0 +1,93 @@
import argparse
import glob
import json
import os
from os.path import join

from tokenizers import Tokenizer, normalizers, trainers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import RandomChunkSplit


parser = argparse.ArgumentParser()
parser.add_argument(
"--files",
default=None,
metavar="path",
type=str,
required=True,
help="The files to use as training; accept '**/*.txt' type of patterns \
if enclosed in quotes",
)
parser.add_argument(
"--out",
default="./",
type=str,
help="Path to the output directory, where the files will be saved",
)
parser.add_argument("--name", default="random-chunk-bpe", type=str, help="The name of the output vocab files")
parser.add_argument("--min-length", default=2, type=int, help="Minimum length of chunks")
parser.add_argument("--max-length", default=5, type=int, help="Maximum length of chunks")
parser.add_argument("--vocab-size", default=10000, type=int, help="Size of vocabulary")
parser.add_argument("--min-frequency", default=2, type=int, help="Minimum frequency for a token to be included")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"No files found matching pattern: {args.files}")
    exit(1)


# Initialize a tokenizer with BPE model
tokenizer = Tokenizer(BPE())

# Use RandomChunkSplit as pre-tokenizer
tokenizer.pre_tokenizer = RandomChunkSplit(min_length=args.min_length, max_length=args.max_length)

# Optional: Add NFKC normalization like SentencePieceBPE
tokenizer.normalizer = normalizers.NFKC()

# Configure the BPE trainer
trainer = trainers.BpeTrainer(
vocab_size=args.vocab_size,
min_frequency=args.min_frequency,
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
show_progress=True
)

# Train the model
print(f"Training BPE with RandomChunkSplit (min_length={args.min_length}, max_length={args.max_length})")
tokenizer.train(files, trainer)

# Save the trained tokenizer
output_path = join(args.out, f"{args.name}.json")
tokenizer.save(output_path)
print(f"Trained tokenizer saved to: {output_path}")

# Create an inference version without pre-tokenizer
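# (Rationale: the random pre-tokenizer is only needed to generate training
# chunks. With the pre_tokenizer field removed, the whole normalized input is
# handed to the BPE model directly, so inference is deterministic while the
# learned multi-word merges can still apply.)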
# First save to a temporary file
temp_tokenizer_path = join(args.out, "temp_tokenizer.json")
tokenizer.save(temp_tokenizer_path)

# Read the JSON
with open(temp_tokenizer_path, "r") as f:
    tokenizer_data = json.load(f)

# Remove pre-tokenizer field if present
if "pre_tokenizer" in tokenizer_data:
    del tokenizer_data["pre_tokenizer"]

# Write modified tokenizer to inference file
inference_path = join(args.out, f"{args.name}_inference.json")
with open(inference_path, "w") as f:
    json.dump(tokenizer_data, f, indent=2)

# Clean up temp file
os.remove(temp_tokenizer_path)

print(f"Inference-ready tokenizer (no pre-tokenizer) saved to: {inference_path}")

# Test encoding with inference tokenizer
tokenizer = Tokenizer.from_file(inference_path)
example = "Training BPE with multi-word tokens is very easy"
print(f"\nTest encoding: {tokenizer.encode(example).tokens}")
93 changes: 93 additions & 0 deletions bindings/python/examples/train_random_whitespace_bpe.py
@@ -0,0 +1,93 @@
import argparse
import glob
import json
import os
from os.path import join

from tokenizers import Tokenizer, normalizers, trainers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import RandomWhitespaceSplit


parser = argparse.ArgumentParser()
parser.add_argument(
"--files",
default=None,
metavar="path",
type=str,
required=True,
help="The files to use as training; accept '**/*.txt' type of patterns \
if enclosed in quotes",
)
parser.add_argument(
"--out",
default="./",
type=str,
help="Path to the output directory, where the files will be saved",
)
parser.add_argument("--name", default="random-whitespace-bpe", type=str, help="The name of the output vocab files")
parser.add_argument("--split-prob", default=0.3, type=float, help="Probability of splitting at whitespace (0.0-1.0)")
parser.add_argument("--vocab-size", default=10000, type=int, help="Size of vocabulary")
parser.add_argument("--min-frequency", default=2, type=int, help="Minimum frequency for a token to be included")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"No files found matching pattern: {args.files}")
    exit(1)


# Initialize a tokenizer with BPE model
tokenizer = Tokenizer(BPE())

# Use RandomWhitespaceSplit as pre-tokenizer
tokenizer.pre_tokenizer = RandomWhitespaceSplit(split_probability=args.split_prob)

# Optional: Add NFKC normalization like SentencePieceBPE
tokenizer.normalizer = normalizers.NFKC()

# Configure the BPE trainer
trainer = trainers.BpeTrainer(
vocab_size=args.vocab_size,
min_frequency=args.min_frequency,
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
show_progress=True
)

# Train the model
print(f"Training BPE with RandomWhitespaceSplit (split_probability={args.split_prob})")
tokenizer.train(files, trainer)

# Save the trained tokenizer
output_path = join(args.out, f"{args.name}.json")
tokenizer.save(output_path)
print(f"Trained tokenizer saved to: {output_path}")

# Create an inference version without pre-tokenizer
# First save to a temporary file
temp_tokenizer_path = join(args.out, "temp_tokenizer.json")
tokenizer.save(temp_tokenizer_path)

# Read the JSON
with open(temp_tokenizer_path, "r") as f:
    tokenizer_data = json.load(f)

# Remove pre-tokenizer field if present
if "pre_tokenizer" in tokenizer_data:
    del tokenizer_data["pre_tokenizer"]

# Write modified tokenizer to inference file
inference_path = join(args.out, f"{args.name}_inference.json")
with open(inference_path, "w") as f:
    json.dump(tokenizer_data, f, indent=2)

# Clean up temp file
os.remove(temp_tokenizer_path)

print(f"Inference-ready tokenizer (no pre-tokenizer) saved to: {inference_path}")

# Test encoding with inference tokenizer
tokenizer = Tokenizer.from_file(inference_path)
example = "Training BPE with multi-word tokens is very easy"
print(f"\nTest encoding: {tokenizer.encode(example).tokens}")

2 changes: 2 additions & 0 deletions bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
@@ -8,6 +8,8 @@
Digits = pre_tokenizers.Digits
Metaspace = pre_tokenizers.Metaspace
Punctuation = pre_tokenizers.Punctuation
RandomChunkSplit = pre_tokenizers.RandomChunkSplit
RandomWhitespaceSplit = pre_tokenizers.RandomWhitespaceSplit
Sequence = pre_tokenizers.Sequence
Split = pre_tokenizers.Split
UnicodeScripts = pre_tokenizers.UnicodeScripts
55 changes: 54 additions & 1 deletion bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -367,6 +367,59 @@ class Punctuation(PreTokenizer):
"""
pass

class RandomChunkSplit(PreTokenizer):
"""
RandomChunkSplit PreTokenizer

This pre-tokenizer splits text into random-length chunks regardless of whitespace
boundaries. It's useful for enabling BPE to learn tokens that span across whitespace.

Args:
min_length (:obj:`int`, `optional`, defaults to :obj:`1`):
The minimum length (in characters) for each chunk.
max_length (:obj:`int`, `optional`, defaults to :obj:`5`):
The maximum length (in characters) for each chunk.
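Example (illustrative; chunk boundaries are drawn at random on each call, so
the output shown is just one possible split)::

    pre_tok = RandomChunkSplit(min_length=2, max_length=5)
    pre_tok.pre_tokenize_str("hello world")
    # e.g. [("hel", (0, 3)), ("lo wo", (3, 8)), ("rld", (8, 11))]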
"""
def __init__(self, min_length=1, max_length=5):
pass

def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place

This method allows you to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`

Args:
pretok (:class:`~tokenizers.PreTokenizedString`):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass

def pre_tokenize_str(self, sequence):
"""
Pre-tokenize the given string

This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

Args:
sequence (:obj:`str`):
A string to pre-tokenize

Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuples with the pre-tokenized parts and their offsets
"""
pass

class Sequence(PreTokenizer):
"""
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
@@ -607,4 +660,4 @@ class WhitespaceSplit(PreTokenizer):
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
pass