feat: Add sample codes for tokenizer extending
YeonwooSung committed Aug 21, 2024
1 parent de314c0 commit 4295ba8
Showing 6 changed files with 224 additions and 0 deletions.
20 changes: 20 additions & 0 deletions Experiments/huggingface/tokenizer_extending/README.md
@@ -0,0 +1,20 @@
# Extending existing AutoTokenizer with new tokens

Extending an existing tokenizer with new tokens from a custom dataset does not always work as expected.
Also, once you extend the tokenizer, you really should fine-tune the model on the new dataset that contains the new tokens.

Basically, extending the tokenizer increases the size of the embedding matrix, which adds newly initialized vectors at the end.
Using these new embeddings untrained might already be useful, but usually at least a few steps of fine-tuning are required.
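
As a rough sketch of that fine-tuning step (the model choice, token names, and training texts below are placeholders, and the masked-LM objective is just one possible choice), the pattern might look like this:

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-cased")

# placeholder tokens and sentences -- replace with your own domain data
tokenizer.add_tokens(["new_token_1", "new_token_2"])
model.resize_token_embeddings(len(tokenizer))  # the new rows start as random vectors

texts = ["a sentence using new_token_1", "another sentence with new_token_2"]
collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.3)  # high masking rate for this tiny toy batch
batch = collator([tokenizer(t) for t in texts])

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()
for _ in range(10):  # a few steps to move the new embeddings away from their random init
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```

In practice you would run this over the actual downstream dataset rather than a toy batch.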

## Code samples

1. [Extending tokenizer with new tokens](./extend_tokenizer_with_new_words.py)
2. [Adding emojis to tokenizer](./add_emojis_to_tokenizer.py) : Writes the emojis as Unicode characters to a file, then reads them back in and adds them as tokens.
3. [Adding multi-word expressions to tokenizer](./add_multiword_expressions_to_tokenizer.py)
4. [Extending existing AutoTokenizer with new bpe-tokenized tokens](./extend_tokenizer_with_new_bpe_tokens.py)
5. [Adding vocabs from one tokenizer to another](./adding_vocabs_from_tokenizer_to_another.py)

## References

* [How to add new tokens to huggingface transformers vocabulary](https://www.depends-on-the-definition.com/how-to-add-new-tokens-to-huggingface-transformers/)
* [Extending existing AutoTokenizer with new tokens](https://stackoverflow.com/a/76198053/9012940)
37 changes: 37 additions & 0 deletions Experiments/huggingface/tokenizer_extending/add_emojis_to_tokenizer.py
@@ -0,0 +1,37 @@
import requests
from transformers import AutoTokenizer, AutoModel


response = requests.get('https://unicode.org/Public/emoji/13.0/emoji-sequences.txt')

with open('emoji.txt', 'w') as fout:
    for line in response.content.decode('utf8').split('\n'):
        # skip blank lines and comments
        if line.strip() and not line.startswith('#'):
            hexa = line.split(';')[0]
            hexa = hexa.split('..')
            if len(hexa) == 1:
                # a single code-point sequence -> join the code points into one emoji
                ch = ''.join([chr(int(h, 16)) for h in hexa[0].strip().split(' ')])
                print(ch, end='\n', file=fout)
            else:
                # a code-point range -> write every emoji in the range
                start, end = hexa
                for cp in range(int(start, 16), int(end, 16) + 1):
                    print(chr(cp), end='\n', file=fout)


# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# add emojis
new_tokens = [e.strip() for e in open('emoji.txt')]

# check if the tokens are already in the vocabulary
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
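
# Optional sanity check (an illustrative addition, not required by the script):
# with the emojis registered as added tokens, a sentence containing one should
# no longer collapse the emoji into [UNK]; printing the tokenization shows this.
print(tokenizer.tokenize("I love 🍕"))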
105 changes: 105 additions & 0 deletions Experiments/huggingface/tokenizer_extending/add_multiword_expressions_to_tokenizer.py
@@ -0,0 +1,105 @@
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
from transformers.pipelines.token_classification import TokenClassificationPipeline
try:
    import tensorflow as tf
except ImportError:
    tf = None


model_checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)


class TokenClassificationChunkPipeline(TokenClassificationPipeline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        inputs = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,  # return multiple chunks for over-long inputs
            max_length=self.tokenizer.model_max_length,
            padding=True
        )
        num_chunks = len(inputs["input_ids"])

        for i in range(num_chunks):
            if self.framework == "tf":
                if tf is None:
                    raise ImportError("TensorFlow is not available")
                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
            else:
                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
            if offset_mapping is not None:
                model_inputs["offset_mapping"] = offset_mapping
            model_inputs["sentence"] = sentence if i == 0 else None
            model_inputs["is_last"] = i == num_chunks - 1
            yield model_inputs

    def _forward(self, model_inputs):
        # Forward pass over a single chunk
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        is_last = model_inputs.pop("is_last")

        overflow_to_sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

        output = self.model(**model_inputs)
        logits = output["logits"] if isinstance(output, dict) else output[0]

        model_outputs = {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "overflow_to_sample_mapping": overflow_to_sample_mapping,
            "is_last": is_last,
            **model_inputs,
        }

        # Reshape outputs to fit with the postprocess inputs
        model_outputs["input_ids"] = torch.reshape(model_outputs["input_ids"], (1, -1))
        model_outputs["token_type_ids"] = torch.reshape(model_outputs["token_type_ids"], (1, -1))
        model_outputs["attention_mask"] = torch.reshape(model_outputs["attention_mask"], (1, -1))
        model_outputs["special_tokens_mask"] = torch.reshape(model_outputs["special_tokens_mask"], (1, -1))
        model_outputs["offset_mapping"] = torch.reshape(model_outputs["offset_mapping"], (1, -1, 2))

        return model_outputs


pipe = TokenClassificationChunkPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple")

mwe_tokens = set(
    token['word']
    for sent in pipe(["Bernard works at BNP Paribas in Paris.", "In New York, you will be a New Man"])
    for token in sent
)


# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# check if the tokens are already in the vocabulary
new_tokens = set(mwe_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
18 changes: 18 additions & 0 deletions Experiments/huggingface/tokenizer_extending/adding_vocabs_from_tokenizer_to_another.py
@@ -0,0 +1,18 @@
from transformers import AutoTokenizer, AutoModel

# load both tokenizers
tokenizer1 = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenizer2 = AutoTokenizer.from_pretrained("roberta-base")

print("Before adding roberta:", len(tokenizer1))

tokens_in_roberta_not_in_bert = set(tokenizer2.vocab).difference(tokenizer1.vocab)
tokenizer1.add_tokens(list(tokens_in_roberta_not_in_bert))

print("After adding roberta:", len(tokenizer1))


model = AutoModel.from_pretrained("bert-base-multilingual-cased")
model.resize_token_embeddings(len(tokenizer1))
26 changes: 26 additions & 0 deletions Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_bpe_tokens.py
@@ -0,0 +1,26 @@
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel


# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# Original vocab size.
print(len(tokenizer))
# Note that most of the output ids are 100, which is BERT's [UNK] (unknown) token.
print(tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))


# Take the first 1000 sentences from cc100.
am_dataset = load_dataset("cc100", "am")
am_train = iter(am_dataset['train'][i]['text'] for i in range(1000))

# Train a new tokenizer using the am_train and the old tokenizer object.
new_tokenizer = tokenizer.train_new_from_iterator(am_train, vocab_size=100_000)

tokenizer.add_tokens(list(new_tokenizer.vocab))

print(new_tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))
print(tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))

# resize the embedding matrix so the model can use the newly added tokens
model.resize_token_embeddings(len(tokenizer))
18 changes: 18 additions & 0 deletions Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_words.py
@@ -0,0 +1,18 @@
from transformers import AutoTokenizer, AutoModel

# pick the model type
model_type = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# new tokens
new_tokens = ["new_token"]

# check if the tokens are already in the vocabulary
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
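
# Optional sanity check (an illustrative addition, not required by the script):
# after add_tokens, "new_token" should surface as a single token instead of
# being split into sub-word pieces by the RoBERTa BPE tokenizer.
print(tokenizer.tokenize("This sentence contains new_token somewhere."))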
