
Commit f1faec1

Fix typos in strings and comments (#1770)
1 parent: 67db0cd

File tree

15 files changed: +16 -16 lines changed


bindings/node/lib/bindings/encoding.test.ts

Lines changed: 1 addition & 1 deletion

@@ -122,7 +122,7 @@ describe('Encoding', () => {
     expect(indexes).toEqual([3, 5])
   })

-  it('returns the corrent indexes with pair sequences', () => {
+  it('returns the correct indexes with pair sequences', () => {
    expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5])
    expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9])
  })

bindings/python/Cargo.toml

Lines changed: 1 addition & 1 deletion

@@ -27,4 +27,4 @@ tempfile = "3.10"
 pyo3 = { version = "0.23", features = ["auto-initialize"] }

 [features]
-defaut = ["pyo3/extension-module"]
+default = ["pyo3/extension-module"]

bindings/python/scripts/convert.py

Lines changed: 1 addition & 1 deletion

@@ -397,7 +397,7 @@ def main():
         "--models",
         type=lambda s: s.split(","),
         default=pretraineds,
-        help=f"The pretrained tokenizers you want to test agains, (default: {pretraineds})",
+        help=f"The pretrained tokenizers you want to test against, (default: {pretraineds})",
     )
     args = parser.parse_args()

bindings/python/src/decoders.rs

Lines changed: 1 addition & 1 deletion

@@ -404,7 +404,7 @@ impl PyMetaspaceDec {
 ///
 /// Args:
 ///     suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-///         The suffix that was used to caracterize an end-of-word. This suffix will
+///         The suffix that was used to characterize an end-of-word. This suffix will
 ///         be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
 pub struct PyBPEDecoder {}
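
Since this doc comment describes runtime behavior, a minimal sketch of exercising it from the Rust crate may help; it assumes `BPEDecoder::new` and the `Decoder` trait's `decode` method as exposed by recent `tokenizers` releases:

use tokenizers::decoders::bpe::BPEDecoder;
use tokenizers::Decoder;

fn main() -> tokenizers::Result<()> {
    // Tokens from a BPE model that marks word endings with "</w>".
    let decoder = BPEDecoder::new("</w>".into());
    let tokens: Vec<String> = vec!["hello</w>".into(), "wor".into(), "ld</w>".into()];
    // Each "</w>" becomes whitespace during decoding, recovering "hello world".
    println!("{}", decoder.decode(tokens)?);
    Ok(())
}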

tokenizers/src/models/bpe/model.rs

Lines changed: 1 addition & 1 deletion

@@ -221,7 +221,7 @@ pub struct BPE {
     pub unk_token: Option<String>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
    /// Do multiple unk tokens get fused
    pub fuse_unk: bool,
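
To make the two fields above concrete, here is a rough sketch (not part of the commit) of setting them through the model's builder; the `BpeBuilder` method names are assumptions taken from the crate's public API:

use tokenizers::models::bpe::BPE;

fn main() -> tokenizers::Result<()> {
    // "##" marks a subword that continues a word (e.g. "token", "##izer"),
    // while "</w>" marks a subword that closes a word (e.g. "token", "izer</w>").
    let bpe = BPE::builder()
        .unk_token("[UNK]".into())
        .continuing_subword_prefix("##".into())
        .end_of_word_suffix("</w>".into())
        .build()?;
    let _ = bpe;
    Ok(())
}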

tokenizers/src/models/bpe/trainer.rs

Lines changed: 1 addition & 1 deletion

@@ -190,7 +190,7 @@ pub struct BpeTrainer {
     pub initial_alphabet: HashSet<char>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
    /// An optional parameter to limit the max length of any single token
    pub max_token_length: Option<usize>,

tokenizers/src/models/unigram/trainer.rs

Lines changed: 1 addition & 1 deletion

@@ -401,7 +401,7 @@ impl UnigramTrainer {

         let logsum_alt = (sum + freq[id] * (alternatives.len() - 1) as f64).ln();

-        // The frequencies of altenatives are increased by freq[i].
+        // The frequencies of alternatives are increased by freq[i].
         let mut logprob_alt = 0.0;
         for n in &alternatives[id] {
             logprob_alt += (freq[*n] + freq[id]).ln() - logsum_alt;
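
To unpack the arithmetic in this hunk: if piece `id` were pruned, its frequency would be folded into each piece of its alternative segmentation, which also inflates the normalizer. A simplified, self-contained restatement (illustrative only, not the trainer's exact code):

/// Log-probability of the alternative segmentation of piece `id`, assuming
/// the piece is removed and its frequency `freq[id]` is added to each piece
/// in `alternatives[id]`. `sum` is the current total frequency mass.
fn logprob_alternatives(freq: &[f64], alternatives: &[Vec<usize>], id: usize, sum: f64) -> f64 {
    // Redistributing freq[id] over k pieces adds a net (k - 1) * freq[id]
    // to the normalizer, since the piece's own mass is reused k times.
    let k = alternatives[id].len();
    let logsum_alt = (sum + freq[id] * (k - 1) as f64).ln();
    alternatives[id]
        .iter()
        .map(|&n| (freq[n] + freq[id]).ln() - logsum_alt)
        .sum()
}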

tokenizers/src/models/wordlevel/mod.rs

Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ impl WordLevelBuilder {
         self
     }

-    /// Contructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
+    /// Constructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordLevel> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordLevel::read_file(&vocab)?;
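
For reference, a minimal usage sketch for this builder; it assumes the `vocab` setter accepts a `HashMap<String, u32>`, which may differ across releases:

use std::collections::HashMap;
use tokenizers::models::wordlevel::WordLevel;

fn main() -> tokenizers::Result<()> {
    // A tiny in-memory vocabulary; out-of-vocabulary words map to "<unk>".
    let vocab: HashMap<String, u32> = [("<unk>", 0), ("hello", 1), ("world", 2)]
        .into_iter()
        .map(|(w, id)| (w.to_string(), id))
        .collect();
    let model = WordLevel::builder()
        .vocab(vocab)
        .unk_token("<unk>".into())
        .build()?;
    let _ = model;
    Ok(())
}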

tokenizers/src/models/wordpiece/mod.rs

Lines changed: 1 addition & 1 deletion

@@ -93,7 +93,7 @@ impl WordPieceBuilder {
         self
     }

-    /// Contructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
+    /// Constructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordPiece> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordPiece::read_file(&vocab)?;

tokenizers/src/models/wordpiece/trainer.rs

Lines changed: 1 addition & 1 deletion

@@ -170,7 +170,7 @@ impl WordPieceTrainer {
         // Transfer the vocab
         model.vocab = new_wordpiece.vocab;
         model.vocab_r = new_wordpiece.vocab_r;
-        // The continuing_subword_prefix is the only other option to be overriden by the trainer
+        // The continuing_subword_prefix is the only other option to be overridden by the trainer
        model.continuing_subword_prefix = new_wordpiece.continuing_subword_prefix;

        Ok(special_tokens)

tokenizers/src/normalizers/precompiled.rs

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &
     transformations.extend(new_part.chars().map(|c| (c, 0)));

     match diff.cmp(&0) {
-        // If we are adding some characters, the last DIFF characters shoud be == 1
+        // If we are adding some characters, the last DIFF characters should be == 1
         Ordering::Greater => {
             transformations
                 .iter_mut()

tokenizers/src/tokenizer/added_vocabulary.rs

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ pub struct AddedToken {
 }

 impl AddedToken {
-    /// Build this token from the given content, specifying if it is intented to be a
+    /// Build this token from the given content, specifying if it is intended to be a
     /// special token. Special tokens are not normalized by default.
    pub fn from<S: Into<String>>(content: S, special: bool) -> Self {
        Self {
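
A short usage sketch for this constructor; the chained setters such as `single_word` are assumptions taken from the crate's public `AddedToken` API:

use tokenizers::AddedToken;

fn main() {
    // A special token: not normalized by default, as the doc comment notes.
    let pad = AddedToken::from("[PAD]", true);
    // A regular added token, restricted to whole-word matches.
    let word = AddedToken::from("hello", false).single_word(true);
    // Both would then be registered via the tokenizer's add_tokens /
    // add_special_tokens methods.
    let _ = (pad, word);
}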

tokenizers/src/tokenizer/mod.rs

Lines changed: 1 addition & 1 deletion

@@ -389,7 +389,7 @@ where
         self
     }

-    /// Set the trunaction parameters.
+    /// Set the truncation parameters.
     #[must_use]
     pub fn with_truncation(mut self, trunc: Option<TruncationParams>) -> Self {
         self.truncation = trunc;
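
For context, a sketch of the parameters this setter receives; the `TruncationParams` field names are assumptions based on current releases:

use tokenizers::{TruncationParams, TruncationStrategy};

fn main() {
    // Truncate every encoding to at most 512 tokens, trimming the longest
    // sequence of a pair first; remaining fields keep their defaults.
    let trunc = TruncationParams {
        max_length: 512,
        strategy: TruncationStrategy::LongestFirst,
        ..Default::default()
    };
    // Passed as Some(trunc) to with_truncation on the builder.
    let _ = trunc;
}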

tokenizers/src/tokenizer/normalizer.rs

Lines changed: 2 additions & 2 deletions

@@ -201,9 +201,9 @@ impl NormalizedString {
         });

         match (start, end) {
-            // Targeting inexistant beginning
+            // Targeting inexistent beginning
             (Some(s), None) => Some(s..s),
-            // Targeting inexistant end
+            // Targeting inexistent end
             (None, Some(e)) => Some(e..e),
            // Found the range
            (Some(s), Some(e)) => Some(s..e),

tokenizers/src/tokenizer/pattern.rs

Lines changed: 1 addition & 1 deletion

@@ -122,7 +122,7 @@ where
     }
 }

-/// Invert the `is_match` flags for the wrapped Pattern. This is usefull
+/// Invert the `is_match` flags for the wrapped Pattern. This is useful
 /// for example when we use a regex that matches words instead of a delimiter,
 /// and we want to match the delimiter.
 pub struct Invert<P: Pattern>(pub P);
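
To make the doc comment concrete, here is a self-contained sketch of the idea (a simplified stand-in, not the crate's actual `Pattern` trait): a pattern splits the input into `((start, end), is_match)` spans, and `Invert` flips each flag so that delimiter spans become the matches.

// Simplified stand-in for the crate's Pattern machinery.
type Span = ((usize, usize), bool);

/// Split `s` into alternating word / non-word spans, flagging word spans.
fn word_spans(s: &str) -> Vec<Span> {
    let mut spans = Vec::new();
    let mut start = 0;
    let mut prev: Option<bool> = None;
    for (i, c) in s.char_indices() {
        let is_word = c.is_alphanumeric();
        if prev.map_or(false, |p| p != is_word) {
            spans.push(((start, i), prev.unwrap()));
            start = i;
        }
        prev = Some(is_word);
    }
    if let Some(p) = prev {
        spans.push(((start, s.len()), p));
    }
    spans
}

/// What `Invert` does: keep the spans, flip the `is_match` flags.
fn invert(spans: Vec<Span>) -> Vec<Span> {
    spans.into_iter().map(|(r, m)| (r, !m)).collect()
}

fn main() {
    // "hello world": the word spans are the matches...
    let spans = word_spans("hello world");
    // ...after inversion, the delimiter (the space) is the match instead.
    println!("{:?}", invert(spans));
}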
