feat: Add sample codes for tokenizer extending
1 parent de314c0 · commit 4295ba8
Showing 6 changed files with 224 additions and 0 deletions.
# Extending an existing AutoTokenizer with new tokens

Extending an existing tokenizer with new tokens from a custom dataset does not always work.
Also, once you extend the tokenizer, you really should fine-tune the model on the new dataset that contains the new tokens.

Basically, extending the tokenizer increases the size of the embedding matrix, which adds newly initialized vectors at the end.
Using these new embeddings untrained might already be useful, but usually at least some steps of fine-tuning are required.
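
As a minimal sketch (using `bert-base-multilingual-cased` here; the two placeholder tokens are purely illustrative), the growth of the embedding matrix can be observed directly:

```python
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased")

old_rows = model.get_input_embeddings().weight.shape[0]
tokenizer.add_tokens(["placeholder_token_a", "placeholder_token_b"])

# Grow the embedding matrix; the extra rows are freshly (randomly) initialized,
# which is why at least some fine-tuning is usually needed afterwards.
model.resize_token_embeddings(len(tokenizer))
new_rows = model.get_input_embeddings().weight.shape[0]
print(old_rows, "->", new_rows)  # two more rows than before
```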

## Code samples

1. [Extending a tokenizer with new tokens](./extend_tokenizer_with_new_words.py)
2. [Adding emojis to a tokenizer](./add_emojis_to_tokenizer.py): prints emojis as Unicode characters to a file, then reads them back in as new tokens.
3. [Adding multi-word expressions to a tokenizer](./add_multiword_expressions_to_tokenizer.py)
4. [Extending an existing AutoTokenizer with new BPE-tokenized tokens](./extend_tokenizer_with_new_bpe_tokens.py)
5. [Adding vocabs from one tokenizer to another](./adding_vocabs_from_tokenizer_to_another.py)

## References

* [How to add new tokens to huggingface transformers vocabulary](https://www.depends-on-the-definition.com/how-to-add-new-tokens-to-huggingface-transformers/)
* [Extending existing AutoTokenizer with new tokens](https://stackoverflow.com/a/76198053/9012940)
Experiments/huggingface/tokenizer_extending/add_emojis_to_tokenizer.py (37 additions, 0 deletions)
import requests
from transformers import AutoTokenizer, AutoModel


# Download the Unicode emoji sequence list and write each emoji to emoji.txt, one per line.
response = requests.get('https://unicode.org/Public/emoji/13.0/emoji-sequences.txt')

with open('emoji.txt', 'w') as fout:
    for line in response.content.decode('utf8').split('\n'):
        if line.strip() and not line.startswith('#'):
            hexa = line.split(';')[0]
            hexa = hexa.split('..')
            if len(hexa) == 1:
                # A single sequence of code points, e.g. "1F468 200D 1F9B0".
                ch = ''.join([chr(int(h, 16)) for h in hexa[0].strip().split(' ')])
                print(ch, end='\n', file=fout)
            else:
                # A code-point range, e.g. "231A..231B".
                start, end = hexa
                for ch in range(int(start, 16), int(end, 16) + 1):
                    print(chr(ch), end='\n', file=fout)


# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# add emojis
new_tokens = [e.strip() for e in open('emoji.txt')]

# check if the tokens are already in the vocabulary
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
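
As a rough sanity check (a sketch, assuming the script above has run), the extended tokenizer should now keep an emoji from the downloaded list as a single token instead of falling back to [UNK]:

# e.g. the watch emoji U+231A comes from the Basic_Emoji ranges in emoji-sequences.txt
print(tokenizer.tokenize("Meet me at 5 ⌚"))   # [..., '⌚'] rather than [..., '[UNK]']
print(tokenizer.convert_tokens_to_ids("⌚"))   # an id at the end of the extended vocab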
Experiments/huggingface/tokenizer_extending/add_multiword_expressions_to_tokenizer.py (105 additions, 0 deletions)
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
from transformers.pipelines.token_classification import TokenClassificationPipeline
try:
    import tensorflow as tf
except ImportError:
    tf = None


model_checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)


class TokenClassificationChunkPipeline(TokenClassificationPipeline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        inputs = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,  # return multiple chunks per long input
            max_length=self.tokenizer.model_max_length,
            padding=True
        )
        num_chunks = len(inputs["input_ids"])

        for i in range(num_chunks):
            if self.framework == "tf":
                if tf is None:
                    raise ImportError("TensorFlow is not available")
                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
            else:
                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
            if offset_mapping is not None:
                model_inputs["offset_mapping"] = offset_mapping
            model_inputs["sentence"] = sentence if i == 0 else None
            model_inputs["is_last"] = i == num_chunks - 1
            yield model_inputs

    def _forward(self, model_inputs):
        # Forward pass
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        is_last = model_inputs.pop("is_last")

        overflow_to_sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

        output = self.model(**model_inputs)
        logits = output["logits"] if isinstance(output, dict) else output[0]

        model_outputs = {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "overflow_to_sample_mapping": overflow_to_sample_mapping,
            "is_last": is_last,
            **model_inputs,
        }

        # We reshape outputs to fit with the postprocess inputs
        model_outputs["input_ids"] = torch.reshape(model_outputs["input_ids"], (1, -1))
        model_outputs["token_type_ids"] = torch.reshape(model_outputs["token_type_ids"], (1, -1))
        model_outputs["attention_mask"] = torch.reshape(model_outputs["attention_mask"], (1, -1))
        model_outputs["special_tokens_mask"] = torch.reshape(model_outputs["special_tokens_mask"], (1, -1))
        model_outputs["offset_mapping"] = torch.reshape(model_outputs["offset_mapping"], (1, -1, 2))

        return model_outputs


pipe = TokenClassificationChunkPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Collect multi-word expressions (aggregated NER spans) from a couple of example sentences.
mwe_tokens = set(
    token['word'] for sent in pipe(
        ["Bernard works at BNP Paribas in Paris.", "In New York, you will be a New Man"])
    for token in sent
)


# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# check if the tokens are already in the vocabulary
new_tokens = set(mwe_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
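
A rough sketch of the expected effect (assuming the NER pipeline above actually returned spans such as "BNP Paribas"): added tokens are matched before WordPiece splitting, so a multi-word expression should now surface as a single token.

# "BNP Paribas" should appear as one token in the output list
print(tokenizer.tokenize("Bernard works at BNP Paribas in Paris."))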
Experiments/huggingface/tokenizer_extending/adding_vocabs_from_tokenizer_to_another.py (18 additions, 0 deletions)
from transformers import AutoTokenizer, AutoModel

# pick the model type
tokenizer1 = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenizer2 = AutoTokenizer.from_pretrained("roberta-base")

print("Before adding roberta:", len(tokenizer1))

tokens_in_roberta_not_in_bert = set(tokenizer2.vocab).difference(tokenizer1.vocab)
tokenizer1.add_tokens(list(tokens_in_roberta_not_in_bert))

print("After adding roberta:", len(tokenizer1))


model = AutoModel.from_pretrained("bert-base-multilingual-cased")
model.resize_token_embeddings(len(tokenizer1))
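
One caveat: roberta-base uses byte-level BPE, so many of the transferred vocabulary entries carry a leading "Ġ" (its marker for a preceding space) and are copied into the BERT tokenizer verbatim. A quick, illustrative way to inspect what was added:

# Peek at a sample of the transferred tokens; many start with "Ġ".
print(len(tokens_in_roberta_not_in_bert), "tokens transferred")
print(sorted(tokens_in_roberta_not_in_bert)[:20])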
Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_bpe_tokens.py (26 additions, 0 deletions)
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel


# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# Original vocab size.
print(len(tokenizer))
# Note: most of the output ids are 100, i.e. the [UNK] token, because the
# Amharic text is not covered by the original vocabulary.
print(tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))


# Take the first 1000 sentences from cc100 (Amharic).
am_dataset = load_dataset("cc100", "am")
am_train = iter(am_dataset['train'][i]['text'] for i in range(1000))

# Train a new tokenizer using am_train and the old tokenizer object.
new_tokenizer = tokenizer.train_new_from_iterator(am_train, vocab_size=100_000)

# Add the newly learned vocabulary to the original tokenizer.
tokenizer.add_tokens(list(new_tokenizer.vocab))

print(new_tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))
print(tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))
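
Unlike the other scripts, this one loads a model but never resizes its embeddings. If the extended tokenizer is meant to feed that model, the embedding matrix has to grow accordingly; a one-line follow-up, same pattern as the other examples:

# give every newly added token id an (initially random) embedding row
model.resize_token_embeddings(len(tokenizer))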
Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_words.py (18 additions, 0 deletions)
from transformers import AutoTokenizer, AutoModel

# pick the model type
model_type = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# new tokens
new_tokens = ["new_token"]

# check if the tokens are already in the vocabulary
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
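
A quick sketch of how one might confirm the addition took effect:

# "new_token" should now be a single token with an id at the end of the vocabulary,
# instead of being split into several BPE pieces.
print(tokenizer.tokenize("this is a new_token example"))
print(tokenizer.convert_tokens_to_ids("new_token"))
print(model.get_input_embeddings().weight.shape)  # number of rows == len(tokenizer)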