diff --git a/Experiments/huggingface/tokenizer_extending/README.md b/Experiments/huggingface/tokenizer_extending/README.md
new file mode 100644
index 00000000..278a2aa7
--- /dev/null
+++ b/Experiments/huggingface/tokenizer_extending/README.md
@@ -0,0 +1,20 @@
+# Extending existing AutoTokenizer with new tokens
+
+Extending an existing tokenizer with new tokens from a custom dataset does not always work as expected.
+Also, once you extend the tokenizer, you really should fine-tune the model on the new dataset so that it learns the new tokens.
+
+Extending the tokenizer grows the model's embedding matrix, appending newly initialized vectors at the end.
+Using these new embeddings untrained might already be useful, but usually at least a few steps of fine-tuning are required.
+
+## Scripts
+
+1. [Extending a tokenizer with new tokens](./extend_tokenizer_with_new_words.py)
+2. [Adding emojis to a tokenizer](./add_emojis_to_tokenizer.py): writes emojis as Unicode characters to a file, then reads them back in as tokens.
+3. [Adding multi-word expressions to a tokenizer](./add_multiword_expressions_to_tokenizer.py)
+4. [Extending an existing AutoTokenizer with new BPE-tokenized tokens](./extend_tokenizer_with_new_bpe_tokens.py)
+5. [Adding vocabs from one tokenizer to another](./adding_vocabs_from_tokenizer_to_another.py)
+
+## References
+
+* [How to add new tokens to huggingface transformers vocabulary](https://www.depends-on-the-definition.com/how-to-add-new-tokens-to-huggingface-transformers/)
+* [Extending existing AutoTokenizer with new tokens](https://stackoverflow.com/a/76198053/9012940)
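To illustrate what the README above describes, here is a minimal sketch (not one of the scripts in this directory; the added token string and the mean-initialization heuristic are assumptions, not part of the original code): `add_tokens` plus `resize_token_embeddings` appends randomly initialized rows to the embedding matrix, and one common heuristic before fine-tuning is to seed those rows with the mean of the pre-existing embeddings.

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased")

# add a hypothetical domain-specific token and grow the embedding matrix to match
num_added = tokenizer.add_tokens(["some_domain_specific_token"])
model.resize_token_embeddings(len(tokenizer))

# the appended rows are randomly initialized; seeding them with the mean of the
# original embeddings gives the model a more reasonable starting point
if num_added > 0:
    with torch.no_grad():
        emb = model.get_input_embeddings().weight
        emb[-num_added:] = emb[:-num_added].mean(dim=0, keepdim=True)
```

Whether the mean-initialization actually helps depends on the model and data; it only replaces pure noise with a plausible starting point, and the fine-tuning the README recommends is still needed.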
diff --git a/Experiments/huggingface/tokenizer_extending/add_emojis_to_tokenizer.py b/Experiments/huggingface/tokenizer_extending/add_emojis_to_tokenizer.py
new file mode 100644
index 00000000..56471c45
--- /dev/null
+++ b/Experiments/huggingface/tokenizer_extending/add_emojis_to_tokenizer.py
@@ -0,0 +1,37 @@
+import requests
+from transformers import AutoTokenizer, AutoModel
+
+
+response = requests.get('https://unicode.org/Public/emoji/13.0/emoji-sequences.txt')
+
+with open('emoji.txt', 'w') as fout:
+    for line in response.content.decode('utf8').split('\n'):
+        if line.strip() and not line.startswith('#'):
+            hexa = line.split(';')[0]
+            hexa = hexa.split('..')
+            if len(hexa) == 1:
+                ch = ''.join([chr(int(h, 16)) for h in hexa[0].strip().split(' ')])
+                print(ch, end='\n', file=fout)
+            else:
+                start, end = hexa
+                for ch in range(int(start, 16), int(end, 16)+1):
+                    #ch = ''.join([chr(int(h, 16)) for h in ch.split(' ')])
+                    print(chr(ch), end='\n', file=fout)
+
+
+# pick the model type
+model_type = "bert-base-multilingual-cased"
+tokenizer = AutoTokenizer.from_pretrained(model_type)
+model = AutoModel.from_pretrained(model_type)
+
+# add emojis
+new_tokens = [e.strip() for e in open('emoji.txt')]
+
+# check if the tokens are already in the vocabulary
+new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())
+
+# add the tokens to the tokenizer vocabulary
+tokenizer.add_tokens(list(new_tokens))
+
+# add new, random embeddings for the new tokens
+model.resize_token_embeddings(len(tokenizer))
diff --git a/Experiments/huggingface/tokenizer_extending/add_multiword_expressions_to_tokenizer.py b/Experiments/huggingface/tokenizer_extending/add_multiword_expressions_to_tokenizer.py
new file mode 100644
index 00000000..e608afd5
--- /dev/null
+++ b/Experiments/huggingface/tokenizer_extending/add_multiword_expressions_to_tokenizer.py
@@ -0,0 +1,105 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
+from transformers.pipelines.token_classification import TokenClassificationPipeline
+try:
+    import tensorflow as tf
+except ImportError:
+    tf = None
+
+
+model_checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"
+
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
+
+
+class TokenClassificationChunkPipeline(TokenClassificationPipeline):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+
+    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
+        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
+        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
+        inputs = self.tokenizer(
+            sentence,
+            return_tensors="pt",
+            truncation=True,
+            return_special_tokens_mask=True,
+            return_offsets_mapping=True,
+            return_overflowing_tokens=True,  # Return multiple chunks
+            max_length=self.tokenizer.model_max_length,
+            padding=True
+        )
+        #inputs.pop("overflow_to_sample_mapping", None)
+        num_chunks = len(inputs["input_ids"])
+
+        for i in range(num_chunks):
+            if self.framework == "tf":
+                if tf is None:
+                    raise ImportError("Tensorflow is not available")
+                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
+            else:
+                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
+            if offset_mapping is not None:
+                model_inputs["offset_mapping"] = offset_mapping
+            model_inputs["sentence"] = sentence if i == 0 else None
+            model_inputs["is_last"] = i == num_chunks - 1
+            yield model_inputs
+
+
+    def _forward(self, model_inputs):
+        # Forward
+        special_tokens_mask = model_inputs.pop("special_tokens_mask")
+        offset_mapping = model_inputs.pop("offset_mapping", None)
+        sentence = model_inputs.pop("sentence")
+        is_last = model_inputs.pop("is_last")
+
+        overflow_to_sample_mapping = model_inputs.pop("overflow_to_sample_mapping")
+
+        output = self.model(**model_inputs)
+        logits = output["logits"] if isinstance(output, dict) else output[0]
+
+
+        model_outputs = {
+            "logits": logits,
+            "special_tokens_mask": special_tokens_mask,
+            "offset_mapping": offset_mapping,
+            "sentence": sentence,
+            "overflow_to_sample_mapping": overflow_to_sample_mapping,
+            "is_last": is_last,
+            **model_inputs,
+        }
+
+        # We reshape outputs to fit with the postprocess inputs
+        model_outputs["input_ids"] = torch.reshape(model_outputs["input_ids"], (1, -1))
+        model_outputs["token_type_ids"] = torch.reshape(model_outputs["token_type_ids"], (1, -1))
+        model_outputs["attention_mask"] = torch.reshape(model_outputs["attention_mask"], (1, -1))
+        model_outputs["special_tokens_mask"] = torch.reshape(model_outputs["special_tokens_mask"], (1, -1))
+        model_outputs["offset_mapping"] = torch.reshape(model_outputs["offset_mapping"], (1, -1, 2))
+
+        return model_outputs
+
+
+pipe = TokenClassificationChunkPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple")
+
+mwe_tokens = set(
+    token['word'] for sent in pipe(
+        ["Bernard works at BNP Paribas in Paris.", "In New York, you will be a New Man"])
+    for token in sent
+)
+
+
+# pick the model type
+model_type = "bert-base-multilingual-cased"
+tokenizer = AutoTokenizer.from_pretrained(model_type)
+model = AutoModel.from_pretrained(model_type)
+
+# check if the tokens are already in the vocabulary
+new_tokens = set(mwe_tokens) - set(tokenizer.vocab.keys())
+
+# add the tokens to the tokenizer vocabulary
+tokenizer.add_tokens(list(new_tokens))
+
+# add new, random embeddings for the new tokens
+model.resize_token_embeddings(len(tokenizer))
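A quick way to check the effect of the two scripts above (a sketch, assuming the same mBERT tokenizer; the hard-coded strings just mirror the NER example sentences and are not part of the original scripts): after `add_tokens`, a multi-word expression surfaces as a single token instead of several word pieces.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# before adding: "BNP Paribas" is split into several word pieces
print(tokenizer.tokenize("Bernard works at BNP Paribas in Paris."))

# add multi-word expressions like those produced by the NER pipeline above
tokenizer.add_tokens(["BNP Paribas", "New York"])

# after adding: "BNP Paribas" comes back as one token
print(tokenizer.tokenize("Bernard works at BNP Paribas in Paris."))
```

The same check works for the emoji script: tokenize a string containing an emoji before and after `add_tokens` and compare the outputs.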
diff --git a/Experiments/huggingface/tokenizer_extending/adding_vocabs_from_tokenizer_to_another.py b/Experiments/huggingface/tokenizer_extending/adding_vocabs_from_tokenizer_to_another.py
new file mode 100644
index 00000000..0c50f85e
--- /dev/null
+++ b/Experiments/huggingface/tokenizer_extending/adding_vocabs_from_tokenizer_to_another.py
@@ -0,0 +1,18 @@
+from datasets import load_dataset
+
+from transformers import AutoTokenizer, AutoModel
+
+# pick the two tokenizers
+tokenizer1 = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
+tokenizer2 = AutoTokenizer.from_pretrained("roberta-base")
+
+print("Before adding roberta:", len(tokenizer1))
+
+tokens_in_roberta_not_in_bert = set(tokenizer2.vocab).difference(tokenizer1.vocab)
+tokenizer1.add_tokens(list(tokens_in_roberta_not_in_bert))
+
+print("After adding roberta:", len(tokenizer1))
+
+
+model = AutoModel.from_pretrained("bert-base-multilingual-cased")
+model.resize_token_embeddings(len(tokenizer1))
diff --git a/Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_bpe_tokens.py b/Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_bpe_tokens.py
new file mode 100644
index 00000000..8afd0325
--- /dev/null
+++ b/Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_bpe_tokens.py
@@ -0,0 +1,26 @@
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModel
+
+
+# pick the model type
+model_type = "bert-base-multilingual-cased"
+tokenizer = AutoTokenizer.from_pretrained(model_type)
+model = AutoModel.from_pretrained(model_type)
+
+# Original vocab size.
+print(len(tokenizer))
+# Note: the output is full of token id 100, which is the [UNK] token, i.e. the Amharic words are unknown.
+print(tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))
+
+
+# Take the first 1000 sentences from the Amharic (am) split of cc100.
+am_dataset = load_dataset("cc100", "am")
+am_train = iter(am_dataset['train'][i]['text'] for i in range(1000))
+
+# Train a new tokenizer on am_train, reusing the old tokenizer's settings.
+new_tokenizer = tokenizer.train_new_from_iterator(am_train, vocab_size=100_000)
+
+tokenizer.add_tokens(list(new_tokenizer.vocab))
+
+print(new_tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))
+print(tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))
diff --git a/Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_words.py b/Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_words.py
new file mode 100644
index 00000000..70f94de9
--- /dev/null
+++ b/Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_words.py
@@ -0,0 +1,18 @@
+from transformers import AutoTokenizer, AutoModel
+
+# pick the model type
+model_type = "roberta-base"
+tokenizer = AutoTokenizer.from_pretrained(model_type)
+model = AutoModel.from_pretrained(model_type)
+
+# new tokens
+new_tokens = ["new_token"]
+
+# check if the tokens are already in the vocabulary
+new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())
+
+# add the tokens to the tokenizer vocabulary
+tokenizer.add_tokens(list(new_tokens))
+
+# add new, random embeddings for the new tokens
+model.resize_token_embeddings(len(tokenizer))
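One follow-up that applies to all of the scripts above (a sketch; the directory name is arbitrary and not taken from the original code): after extending, save the tokenizer and the resized model together, so that later fine-tuning or inference reloads a matching vocabulary and embedding matrix.

```python
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

# same toy extension as extend_tokenizer_with_new_words.py
tokenizer.add_tokens(["new_token"])
model.resize_token_embeddings(len(tokenizer))

# persist both side by side so the vocabulary and embedding matrix stay in sync
tokenizer.save_pretrained("./extended-model")
model.save_pretrained("./extended-model")

# reload them together later, e.g. before fine-tuning on the new dataset
tokenizer = AutoTokenizer.from_pretrained("./extended-model")
model = AutoModel.from_pretrained("./extended-model")
```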