From 43c31b94e906bebd4549faa3a54d02d9f603a9e5 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 25 Apr 2025 20:51:44 +0800
Subject: [PATCH] Fix typos in strings and comments

---
 bindings/node/lib/bindings/encoding.test.ts  | 2 +-
 bindings/python/Cargo.toml                   | 2 +-
 bindings/python/scripts/convert.py           | 2 +-
 bindings/python/src/decoders.rs              | 2 +-
 tokenizers/src/models/bpe/model.rs           | 2 +-
 tokenizers/src/models/bpe/trainer.rs         | 2 +-
 tokenizers/src/models/unigram/trainer.rs     | 2 +-
 tokenizers/src/models/wordlevel/mod.rs       | 2 +-
 tokenizers/src/models/wordpiece/mod.rs       | 2 +-
 tokenizers/src/models/wordpiece/trainer.rs   | 2 +-
 tokenizers/src/normalizers/precompiled.rs    | 2 +-
 tokenizers/src/tokenizer/added_vocabulary.rs | 2 +-
 tokenizers/src/tokenizer/mod.rs              | 2 +-
 tokenizers/src/tokenizer/normalizer.rs       | 4 ++--
 tokenizers/src/tokenizer/pattern.rs          | 2 +-
 15 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/bindings/node/lib/bindings/encoding.test.ts b/bindings/node/lib/bindings/encoding.test.ts
index b1d84fef1..5f818cf68 100644
--- a/bindings/node/lib/bindings/encoding.test.ts
+++ b/bindings/node/lib/bindings/encoding.test.ts
@@ -122,7 +122,7 @@ describe('Encoding', () => {
     expect(indexes).toEqual([3, 5])
   })

-  it('returns the corrent indexes with pair sequences', () => {
+  it('returns the correct indexes with pair sequences', () => {
     expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5])
     expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9])
   })
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index 6e8b0c34c..2c1daae83 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -27,4 +27,4 @@ tempfile = "3.10"
 pyo3 = { version = "0.23", features = ["auto-initialize"] }

 [features]
-defaut = ["pyo3/extension-module"]
+default = ["pyo3/extension-module"]
diff --git a/bindings/python/scripts/convert.py b/bindings/python/scripts/convert.py
index 50c13862b..67c41075d 100644
--- a/bindings/python/scripts/convert.py
+++ b/bindings/python/scripts/convert.py
@@ -397,7 +397,7 @@ def main():
         "--models",
         type=lambda s: s.split(","),
         default=pretraineds,
-        help=f"The pretrained tokenizers you want to test agains, (default: {pretraineds})",
+        help=f"The pretrained tokenizers you want to test against, (default: {pretraineds})",
     )
     args = parser.parse_args()

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 4a408ff1d..d85289a25 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -404,7 +404,7 @@ impl PyMetaspaceDec {
 ///
 /// Args:
 ///     suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-///         The suffix that was used to caracterize an end-of-word. This suffix will
+///         The suffix that was used to characterize an end-of-word. This suffix will
 ///             be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
 pub struct PyBPEDecoder {}
diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs
index 217c37e90..50c9815e9 100644
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -221,7 +221,7 @@ pub struct BPE {
     pub unk_token: Option<String>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
     /// Do multiple unk tokens get fused
     pub fuse_unk: bool,
diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs
index a1a0aba76..2484865be 100644
--- a/tokenizers/src/models/bpe/trainer.rs
+++ b/tokenizers/src/models/bpe/trainer.rs
@@ -190,7 +190,7 @@ pub struct BpeTrainer {
     pub initial_alphabet: HashSet<char>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
     /// An optional parameter to limit the max length of any single token
     pub max_token_length: Option<usize>,
diff --git a/tokenizers/src/models/unigram/trainer.rs b/tokenizers/src/models/unigram/trainer.rs
index 5d178e77b..d6d2830fd 100644
--- a/tokenizers/src/models/unigram/trainer.rs
+++ b/tokenizers/src/models/unigram/trainer.rs
@@ -401,7 +401,7 @@ impl UnigramTrainer {

             let logsum_alt = (sum + freq[id] * (alternatives.len() - 1) as f64).ln();

-            // The frequencies of altenatives are increased by freq[i].
+            // The frequencies of alternatives are increased by freq[i].
             let mut logprob_alt = 0.0;
             for n in &alternatives[id] {
                 logprob_alt += (freq[*n] + freq[id]).ln() - logsum_alt;
diff --git a/tokenizers/src/models/wordlevel/mod.rs b/tokenizers/src/models/wordlevel/mod.rs
index 545db13a7..dd66de08e 100644
--- a/tokenizers/src/models/wordlevel/mod.rs
+++ b/tokenizers/src/models/wordlevel/mod.rs
@@ -73,7 +73,7 @@ impl WordLevelBuilder {
         self
     }

-    /// Contructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
+    /// Constructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordLevel> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordLevel::read_file(&vocab)?;
diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs
index 0c63405c1..5c06bd4b3 100644
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -93,7 +93,7 @@ impl WordPieceBuilder {
         self
     }

-    /// Contructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
+    /// Constructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordPiece> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordPiece::read_file(&vocab)?;
diff --git a/tokenizers/src/models/wordpiece/trainer.rs b/tokenizers/src/models/wordpiece/trainer.rs
index 58a5abc8f..f2e79baee 100644
--- a/tokenizers/src/models/wordpiece/trainer.rs
+++ b/tokenizers/src/models/wordpiece/trainer.rs
@@ -170,7 +170,7 @@ impl WordPieceTrainer {
         // Transfer the vocab
         model.vocab = new_wordpiece.vocab;
         model.vocab_r = new_wordpiece.vocab_r;
-        // The continuing_subword_prefix is the only other option to be overriden by the trainer
+        // The continuing_subword_prefix is the only other option to be overridden by the trainer
         model.continuing_subword_prefix = new_wordpiece.continuing_subword_prefix;

         Ok(special_tokens)
diff --git a/tokenizers/src/normalizers/precompiled.rs b/tokenizers/src/normalizers/precompiled.rs
index c14f86c07..90b484a81 100644
--- a/tokenizers/src/normalizers/precompiled.rs
+++ b/tokenizers/src/normalizers/precompiled.rs
@@ -12,7 +12,7 @@ fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &
     transformations.extend(new_part.chars().map(|c| (c, 0)));

     match diff.cmp(&0) {
-        // If we are adding some characters, the last DIFF characters shoud be == 1
+        // If we are adding some characters, the last DIFF characters should be == 1
         Ordering::Greater => {
             transformations
                 .iter_mut()
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index f988477be..f18b4529e 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -29,7 +29,7 @@ pub struct AddedToken {
 }

 impl AddedToken {
-    /// Build this token from the given content, specifying if it is intented to be a
+    /// Build this token from the given content, specifying if it is intended to be a
     /// special token. Special tokens are not normalized by default.
     pub fn from<S: Into<String>>(content: S, special: bool) -> Self {
         Self {
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 808d120d5..f4a136091 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -389,7 +389,7 @@ where
         self
     }

-    /// Set the trunaction parameters.
+    /// Set the truncation parameters.
     #[must_use]
     pub fn with_truncation(mut self, trunc: Option<TruncationParams>) -> Self {
         self.truncation = trunc;
diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index 432c6cc69..7f50d9c97 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -201,9 +201,9 @@ impl NormalizedString {
         });

         match (start, end) {
-            // Targeting inexistant beginning
+            // Targeting inexistent beginning
             (Some(s), None) => Some(s..s),
-            // Targeting inexistant end
+            // Targeting inexistent end
             (None, Some(e)) => Some(e..e),
             // Found the range
             (Some(s), Some(e)) => Some(s..e),
diff --git a/tokenizers/src/tokenizer/pattern.rs b/tokenizers/src/tokenizer/pattern.rs
index 9fa22dd9b..a2a2f1684 100644
--- a/tokenizers/src/tokenizer/pattern.rs
+++ b/tokenizers/src/tokenizer/pattern.rs
@@ -122,7 +122,7 @@ where
     }
 }

-/// Invert the `is_match` flags for the wrapped Pattern. This is usefull
+/// Invert the `is_match` flags for the wrapped Pattern. This is useful
 /// for example when we use a regex that matches words instead of a delimiter,
 /// and we want to match the delimiter.
 pub struct Invert<P: Pattern>(pub P);