From b7947d19c711859af306a35382e9d691b71fcf7e Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 11:37:20 +0100
Subject: [PATCH 1/7] nits

---
 bindings/python/src/decoders.rs                  | 5 +++++
 bindings/python/tests/bindings/test_tokenizer.py | 3 +++
 tokenizers/src/tokenizer/mod.rs                  | 5 +++++
 3 files changed, 13 insertions(+)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 4a408ff1d..eea5563f6 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -661,6 +661,11 @@ impl PyDecodeStream {
         }
     }
 
+    #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
+    fn from_sequence(&mut self, sequence_ids: Vec<u32>) {
+        self.ids = sequence_ids;
+    }
+
     #[pyo3(signature = (tokenizer, id), text_signature = "(self, tokenizer, id)")]
     fn step(&mut self, tokenizer: &PyTokenizer, id: u32) -> PyResult<Option<String>> {
         ToPyResult(tk::tokenizer::step_decode_stream(
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index d50f283e7..4cc8573c6 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -371,6 +371,9 @@ def test_decode(self):
         assert stream.step(tokenizer, 2) == " is"
         assert stream.step(tokenizer, 3) == " john"
 
+        stream.from_sequence([0, 1, 2, 3])
+        assert stream.step(tokenizer, 4) == "pair"
+
     def test_decode_stream(self):
         vocab = [
             ("<unk>", 0.0),
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 808d120d5..407713851 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -1072,6 +1072,11 @@ where
             &mut self.prefix_index,
         )
     }
+
+    // Allows prefilling the tokenizer. Bit weird because not called in python
+    pub fn from_sequence(&mut self, sequence_ids: Vec<u32>){
+        self.ids = sequence_ids;
+    }
 }
 
 /// Internal function exposed only to bypass python limitations

From 2ce721bee66a39d01b3bf9301ef8fe180b1db65a Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 11:44:36 +0100
Subject: [PATCH 2/7] with

---
 bindings/python/src/decoders.rs                  | 2 +-
 bindings/python/tests/bindings/test_tokenizer.py | 2 +-
 tokenizers/src/tokenizer/mod.rs                  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index eea5563f6..e541c9cd4 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -662,7 +662,7 @@ impl PyDecodeStream {
     }
 
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
-    fn from_sequence(&mut self, sequence_ids: Vec<u32>) {
+    fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
     }
 
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 4cc8573c6..81f805a36 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -371,7 +371,7 @@ def test_decode(self):
         assert stream.step(tokenizer, 2) == " is"
         assert stream.step(tokenizer, 3) == " john"
 
-        stream.from_sequence([0, 1, 2, 3])
+        stream.with_sequence([0, 1, 2, 3])
         assert stream.step(tokenizer, 4) == "pair"
 
     def test_decode_stream(self):
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 407713851..3ad57f49e 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -1074,7 +1074,7 @@ where
     }
 
     // Allows prefilling the tokenizer. Bit weird because not called in python
-    pub fn from_sequence(&mut self, sequence_ids: Vec<u32>){
+    pub fn with_sequence(&mut self, sequence_ids: Vec<u32>){
         self.ids = sequence_ids;
     }
 }

From 46be05901553a3f24437ea4ee8d6d321a6ff4d6e Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 13:50:40 +0100
Subject: [PATCH 3/7] update

---
 bindings/python/src/decoders.rs | 2 ++
 tokenizers/src/tokenizer/mod.rs | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index e541c9cd4..2165c6e74 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -664,6 +664,8 @@ impl PyDecodeStream {
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
     fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
+        self.prefix_index = 0;
+        self.prefix = "".to_string();
     }
 
     #[pyo3(signature = (tokenizer, id), text_signature = "(self, tokenizer, id)")]
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 3ad57f49e..9d1a2de90 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -1072,9 +1072,9 @@ where
             &mut self.prefix_index,
         )
     }
-    
+
     // Allows prefilling the tokenizer. Bit weird because not called in python
-    pub fn with_sequence(&mut self, sequence_ids: Vec<u32>){
+    pub fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
     }
 }

From 69206e273075f3c061aa0896a7f171a2004fb75d Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 14:38:20 +0100
Subject: [PATCH 4/7] update

---
 bindings/python/src/decoders.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 2165c6e74..334e0ff27 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -664,7 +664,7 @@ impl PyDecodeStream {
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
     fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
-        self.prefix_index = 0;
+        self.prefix_index = sequence_ids.len();
         self.prefix = "".to_string();
     }
 

From 24d1068c061b193eb6abdb65892e742a7bf72250 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 14:46:13 +0100
Subject: [PATCH 5/7] zut

---
 bindings/python/src/decoders.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 334e0ff27..dc490a1bb 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -664,7 +664,7 @@ impl PyDecodeStream {
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
     fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
-        self.prefix_index = sequence_ids.len();
+        self.prefix_index = &self.ids.len();
         self.prefix = "".to_string();
     }
 

From 3e1935761601d316de13124849d6b886dd412e46 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 14:54:07 +0100
Subject: [PATCH 6/7] & bad

---
 bindings/python/src/decoders.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index dc490a1bb..0cd1a92ce 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -664,7 +664,7 @@ impl PyDecodeStream {
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
     fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
-        self.prefix_index = &self.ids.len();
+        self.prefix_index = self.ids.len();
         self.prefix = "".to_string();
     }
 

From d1a7c66374ff28b6fc99e68779f351908519be56 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 16:27:04 +0100
Subject: [PATCH 7/7] stub

---
 .../py_src/tokenizers/decoders/__init__.pyi       |  2 +-
 .../tokenizers/normalizers/__init__.pyi           |  2 +-
 .../tokenizers/pre_tokenizers/__init__.pyi        |  8 ++---
 bindings/python/src/decoders.rs                   | 36 +++++++++----------
 .../python/tests/bindings/test_tokenizer.py       |  2 +-
 5 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi
index 672aebb8d..adad6f53b 100644
--- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi
@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
     ByteFallback Decoder
     ByteFallback is a simple trick which converts tokens looking like `<0x61>`
     to pure bytes, and attempts to make them into a string. If the tokens
-    cannot be decoded you will get � instead for each inconvertible byte token
+    cannot be decoded you will get � instead for each inconvertable byte token
     """
 
     def __init__(self):
diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
index 1f5555104..8c4e744d1 100644
--- a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
@@ -389,7 +389,7 @@ class Nmt(Normalizer):
 class Precompiled(Normalizer):
     """
     Precompiled normalizer
-    Don't use manually it is used for compatibility for SentencePiece.
+    Don't use manually it is used for compatiblity for SentencePiece.
     """
     def __init__(self, precompiled_charsmap):
         pass
diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
index 6f31ff3a2..ea1b4954e 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
     BertPreTokenizer
 
     This pre-tokenizer splits tokens on spaces, and also on punctuation.
-    Each occurrence of a punctuation character will be treated separately.
+    Each occurence of a punctuation character will be treated separately.
     """
     def __init__(self):
         pass
@@ -421,11 +421,11 @@ class Split(PreTokenizer):
 
     Args:
         pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
-            If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
+            A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`.
+            If you want to use a regex pattern, it has to be wrapped around a `tokenizer.Regex`,
             otherwise we consider is as a string pattern. For example `pattern="|"`
             means you want to split on `|` (imagine a csv file for example), while
-            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
+            `patter=tokenizer.Regex("1|2")` means you split on either '1' or '2'.
         behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
             The behavior to use when splitting.
Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs index 0cd1a92ce..59ebca5b4 100644 --- a/bindings/python/src/decoders.rs +++ b/bindings/python/src/decoders.rs @@ -603,24 +603,6 @@ impl Decoder for PyDecoderWrapper { } } -/// Decoders Module -#[pymodule] -pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> { - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - Ok(()) -} - /// Class needed for streaming decode /// #[pyclass(module = "tokenizers.decoders", name = "DecodeStream")] @@ -682,6 +664,24 @@ impl PyDecodeStream { } } +/// Decoders Module +#[pymodule] +pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) +} + #[cfg(test)] mod test { use std::sync::{Arc, RwLock}; diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py index 81f805a36..5050f60d3 100644 --- a/bindings/python/tests/bindings/test_tokenizer.py +++ b/bindings/python/tests/bindings/test_tokenizer.py @@ -372,7 +372,7 @@ def test_decode(self): assert stream.step(tokenizer, 3) == " john" stream.with_sequence([0, 1, 2, 3]) - assert stream.step(tokenizer, 4) == "pair" + assert stream.step(tokenizer, 4) == "my name is john pair" def test_decode_stream(self): vocab = [