From b7947d19c711859af306a35382e9d691b71fcf7e Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 11:37:20 +0100
Subject: [PATCH 1/7] nits

---
 bindings/python/src/decoders.rs                  | 5 +++++
 bindings/python/tests/bindings/test_tokenizer.py | 3 +++
 tokenizers/src/tokenizer/mod.rs                  | 5 +++++
 3 files changed, 13 insertions(+)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 4a408ff1d..eea5563f6 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -661,6 +661,11 @@ impl PyDecodeStream {
         }
     }
 
+    #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
+    fn from_sequence(&mut self, sequence_ids: Vec<u32>) {
+        self.ids = sequence_ids;
+    }
+
     #[pyo3(signature = (tokenizer, id), text_signature = "(self, tokenizer, id)")]
     fn step(&mut self, tokenizer: &PyTokenizer, id: u32) -> PyResult<Option<String>> {
         ToPyResult(tk::tokenizer::step_decode_stream(
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index d50f283e7..4cc8573c6 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -371,6 +371,9 @@ def test_decode(self):
         assert stream.step(tokenizer, 2) == " is"
         assert stream.step(tokenizer, 3) == " john"
 
+        stream.from_sequence([0, 1, 2, 3])
+        assert stream.step(tokenizer, 4) == "pair"
+
     def test_decode_stream(self):
         vocab = [
             ("<unk>", 0.0),
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 808d120d5..407713851 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -1072,6 +1072,11 @@ where
             &mut self.prefix_index,
         )
     }
+
+    // Allows prefilling the tokenizer. Bit weird because not called in python
+    pub fn from_sequence(&mut self, sequence_ids: Vec<u32>){
+        self.ids = sequence_ids;
+    }
 }
 
 /// Internal function exposed only to bypass python limitations

From 2ce721bee66a39d01b3bf9301ef8fe180b1db65a Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 11:44:36 +0100
Subject: [PATCH 2/7] with

---
 bindings/python/src/decoders.rs                  | 2 +-
 bindings/python/tests/bindings/test_tokenizer.py | 2 +-
 tokenizers/src/tokenizer/mod.rs                  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index eea5563f6..e541c9cd4 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -662,7 +662,7 @@ impl PyDecodeStream {
     }
 
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
-    fn from_sequence(&mut self, sequence_ids: Vec<u32>) {
+    fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
     }
 
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 4cc8573c6..81f805a36 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -371,7 +371,7 @@ def test_decode(self):
         assert stream.step(tokenizer, 2) == " is"
         assert stream.step(tokenizer, 3) == " john"
 
-        stream.from_sequence([0, 1, 2, 3])
+        stream.with_sequence([0, 1, 2, 3])
         assert stream.step(tokenizer, 4) == "pair"
 
     def test_decode_stream(self):
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 407713851..3ad57f49e 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -1074,7 +1074,7 @@ where
     }
 
     // Allows prefilling the tokenizer. Bit weird because not called in python
-    pub fn from_sequence(&mut self, sequence_ids: Vec<u32>){
+    pub fn with_sequence(&mut self, sequence_ids: Vec<u32>){
         self.ids = sequence_ids;
     }
 }

From 46be05901553a3f24437ea4ee8d6d321a6ff4d6e Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 13:50:40 +0100
Subject: [PATCH 3/7] update

---
 bindings/python/src/decoders.rs | 2 ++
 tokenizers/src/tokenizer/mod.rs | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index e541c9cd4..2165c6e74 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -664,6 +664,8 @@ impl PyDecodeStream {
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
     fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
+        self.prefix_index = 0;
+        self.prefix = "".to_string();
     }
 
     #[pyo3(signature = (tokenizer, id), text_signature = "(self, tokenizer, id)")]
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 3ad57f49e..9d1a2de90 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -1072,9 +1072,9 @@ where
             &mut self.prefix_index,
         )
     }
-    
+
     // Allows prefilling the tokenizer. Bit weird because not called in python
-    pub fn with_sequence(&mut self, sequence_ids: Vec<u32>){
+    pub fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
     }
 }

From 69206e273075f3c061aa0896a7f171a2004fb75d Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 14:38:20 +0100
Subject: [PATCH 4/7] update

---
 bindings/python/src/decoders.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 2165c6e74..334e0ff27 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -664,7 +664,7 @@ impl PyDecodeStream {
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
     fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
-        self.prefix_index = 0;
+        self.prefix_index = sequence_ids.len();
         self.prefix = "".to_string();
     }
 

From 24d1068c061b193eb6abdb65892e742a7bf72250 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 14:46:13 +0100
Subject: [PATCH 5/7] zut

---
 bindings/python/src/decoders.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 334e0ff27..dc490a1bb 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -664,7 +664,7 @@ impl PyDecodeStream {
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
     fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
-        self.prefix_index = sequence_ids.len();
+        self.prefix_index = &self.ids.len();
         self.prefix = "".to_string();
     }
 

From 3e1935761601d316de13124849d6b886dd412e46 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 14:54:07 +0100
Subject: [PATCH 6/7] & bad

---
 bindings/python/src/decoders.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index dc490a1bb..0cd1a92ce 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -664,7 +664,7 @@ impl PyDecodeStream {
     #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
     fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
         self.ids = sequence_ids;
-        self.prefix_index = &self.ids.len();
+        self.prefix_index = self.ids.len();
         self.prefix = "".to_string();
     }
 

From d1a7c66374ff28b6fc99e68779f351908519be56 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 21 Jan 2025 16:27:04 +0100
Subject: [PATCH 7/7] stub

---
 .../py_src/tokenizers/decoders/__init__.pyi       |  2 +-
 .../tokenizers/normalizers/__init__.pyi           |  2 +-
 .../tokenizers/pre_tokenizers/__init__.pyi        |  8 ++---
 bindings/python/src/decoders.rs                   | 36 +++++++++----------
 .../python/tests/bindings/test_tokenizer.py       |  2 +-
 5 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi
index 672aebb8d..adad6f53b 100644
--- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi
@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
     ByteFallback Decoder
     ByteFallback is a simple trick which converts tokens looking like `<0x61>`
     to pure bytes, and attempts to make them into a string. If the tokens
-    cannot be decoded you will get � instead for each inconvertible byte token
+    cannot be decoded you will get � instead for each inconvertable byte token
     """
 
     def __init__(self):
diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
index 1f5555104..8c4e744d1 100644
--- a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
@@ -389,7 +389,7 @@ class Nmt(Normalizer):
 class Precompiled(Normalizer):
     """
     Precompiled normalizer
-    Don't use manually it is used for compatibility for SentencePiece.
+    Don't use manually it is used for compatiblity for SentencePiece.
     """
     def __init__(self, precompiled_charsmap):
         pass
diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
index 6f31ff3a2..ea1b4954e 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
     BertPreTokenizer
 
     This pre-tokenizer splits tokens on spaces, and also on punctuation.
-    Each occurrence of a punctuation character will be treated separately.
+    Each occurence of a punctuation character will be treated separately.
     """
     def __init__(self):
         pass
@@ -421,11 +421,11 @@ class Split(PreTokenizer):
 
     Args:
         pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
-            If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
+            A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`.
+            If you want to use a regex pattern, it has to be wrapped around a `tokenizer.Regex`,
             otherwise we consider is as a string pattern. For example `pattern="|"`
             means you want to split on `|` (imagine a csv file for example), while
-            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
+            `patter=tokenizer.Regex("1|2")` means you split on either '1' or '2'.
         behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
             The behavior to use when splitting.
Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs index 0cd1a92ce..59ebca5b4 100644 --- a/bindings/python/src/decoders.rs +++ b/bindings/python/src/decoders.rs @@ -603,24 +603,6 @@ impl Decoder for PyDecoderWrapper { } } -/// Decoders Module -#[pymodule] -pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> { - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - Ok(()) -} - /// Class needed for streaming decode /// #[pyclass(module = "tokenizers.decoders", name = "DecodeStream")] @@ -682,6 +664,24 @@ impl PyDecodeStream { } } +/// Decoders Module +#[pymodule] +pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) +} + #[cfg(test)] mod test { use std::sync::{Arc, RwLock}; diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py index 81f805a36..5050f60d3 100644 --- a/bindings/python/tests/bindings/test_tokenizer.py +++ b/bindings/python/tests/bindings/test_tokenizer.py @@ -372,7 +372,7 @@ def test_decode(self): assert stream.step(tokenizer, 3) == " john" stream.with_sequence([0, 1, 2, 3]) - assert stream.step(tokenizer, 4) == "pair" + assert stream.step(tokenizer, 4) == "my name is john pair" def test_decode_stream(self): vocab = [