feat: Add sample codes for tokenizer extending
YeonwooSung committed Aug 21, 2024
1 parent de314c0 commit 4295ba8
Showing 6 changed files with 224 additions and 0 deletions.
20 changes: 20 additions & 0 deletions Experiments/huggingface/tokenizer_extending/README.md
@@ -0,0 +1,20 @@
# Extending existing AutoTokenizer with new tokens

Extending an existing tokenizer with new tokens from a custom dataset does not always work as expected.
Also, once you extend the tokenizer, you really should fine-tune the model on the new dataset that contains the new tokens.

Basically, extending the tokenizer increases the size of the embedding matrix, which adds newly initialized vectors at the end.
Using these new embeddings untrained might already be useful, but usually at least a few steps of fine-tuning are required.
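
As a rough sketch of that fine-tuning step (the model choice, token names, and training texts below are placeholders, and the masked-LM objective is just one possible choice), the pattern might look like this:

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-cased")

# placeholder tokens and sentences -- replace with your own domain data
tokenizer.add_tokens(["new_token_1", "new_token_2"])
model.resize_token_embeddings(len(tokenizer))  # the new rows start as random vectors

texts = ["a sentence using new_token_1", "another sentence with new_token_2"]
collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.3)  # high masking rate for this tiny toy batch
batch = collator([tokenizer(t) for t in texts])

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()
for _ in range(10):  # a few steps to move the new embeddings away from their random init
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```

In practice you would run this over the actual downstream dataset rather than a toy batch.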

## Code samples

1. [Extending tokenizer with new tokens](./extend_tokenizer_with_new_words.py)
2. [Adding emojis to tokenizer](./add_emojis_to_tokenizer.py) : Writes the emojis as Unicode characters to a file, then reads them back in and adds them as tokens.
3. [Adding multi-word expressions to tokenizer](./add_multiword_expressions_to_tokenizer.py)
4. [Extending existing AutoTokenizer with new bpe-tokenized tokens](./extend_tokenizer_with_new_bpe_tokens.py)
5. [Adding vocabs from one tokenizer to another](./adding_vocabs_from_tokenizer_to_another.py)

## References

* [How to add new tokens to huggingface transformers vocabulary](https://www.depends-on-the-definition.com/how-to-add-new-tokens-to-huggingface-transformers/)
* [Extending existing AutoTokenizer with new tokens](https://stackoverflow.com/a/76198053/9012940)
37 changes: 37 additions & 0 deletions Experiments/huggingface/tokenizer_extending/add_emojis_to_tokenizer.py
@@ -0,0 +1,37 @@
import requests
from transformers import AutoTokenizer, AutoModel


response = requests.get('https://unicode.org/Public/emoji/13.0/emoji-sequences.txt')

with open('emoji.txt', 'w') as fout:
    for line in response.content.decode('utf8').split('\n'):
        # skip blank lines and comments
        if line.strip() and not line.startswith('#'):
            hexa = line.split(';')[0]
            hexa = hexa.split('..')
            if len(hexa) == 1:
                # a single code-point sequence -> join the code points into one emoji
                ch = ''.join([chr(int(h, 16)) for h in hexa[0].strip().split(' ')])
                print(ch, end='\n', file=fout)
            else:
                # a code-point range -> write every emoji in the range
                start, end = hexa
                for cp in range(int(start, 16), int(end, 16) + 1):
                    print(chr(cp), end='\n', file=fout)


# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# add emojis
new_tokens = [e.strip() for e in open('emoji.txt')]

# check if the tokens are already in the vocabulary
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
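
# Optional sanity check (an illustrative addition, not required by the script):
# with the emojis registered as added tokens, a sentence containing one should
# no longer collapse the emoji into [UNK]; printing the tokenization shows this.
print(tokenizer.tokenize("I love 🍕"))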
105 changes: 105 additions & 0 deletions Experiments/huggingface/tokenizer_extending/add_multiword_expressions_to_tokenizer.py
@@ -0,0 +1,105 @@
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
from transformers.pipelines.token_classification import TokenClassificationPipeline
try:
    import tensorflow as tf
except ImportError:
    tf = None


model_checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)


class TokenClassificationChunkPipeline(TokenClassificationPipeline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        inputs = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,  # return multiple chunks for over-long inputs
            max_length=self.tokenizer.model_max_length,
            padding=True
        )
        num_chunks = len(inputs["input_ids"])

        for i in range(num_chunks):
            if self.framework == "tf":
                if tf is None:
                    raise ImportError("TensorFlow is not available")
                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
            else:
                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
            if offset_mapping is not None:
                model_inputs["offset_mapping"] = offset_mapping
            model_inputs["sentence"] = sentence if i == 0 else None
            model_inputs["is_last"] = i == num_chunks - 1
            yield model_inputs

    def _forward(self, model_inputs):
        # Forward pass over a single chunk
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        is_last = model_inputs.pop("is_last")

        overflow_to_sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

        output = self.model(**model_inputs)
        logits = output["logits"] if isinstance(output, dict) else output[0]

        model_outputs = {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "overflow_to_sample_mapping": overflow_to_sample_mapping,
            "is_last": is_last,
            **model_inputs,
        }

        # Reshape outputs to fit with the postprocess inputs
        model_outputs["input_ids"] = torch.reshape(model_outputs["input_ids"], (1, -1))
        model_outputs["token_type_ids"] = torch.reshape(model_outputs["token_type_ids"], (1, -1))
        model_outputs["attention_mask"] = torch.reshape(model_outputs["attention_mask"], (1, -1))
        model_outputs["special_tokens_mask"] = torch.reshape(model_outputs["special_tokens_mask"], (1, -1))
        model_outputs["offset_mapping"] = torch.reshape(model_outputs["offset_mapping"], (1, -1, 2))

        return model_outputs


pipe = TokenClassificationChunkPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple")

mwe_tokens = set(
    token['word']
    for sent in pipe(["Bernard works at BNP Paribas in Paris.", "In New York, you will be a New Man"])
    for token in sent
)


# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# check if the tokens are already in the vocabulary
new_tokens = set(mwe_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
18 changes: 18 additions & 0 deletions Experiments/huggingface/tokenizer_extending/adding_vocabs_from_tokenizer_to_another.py
@@ -0,0 +1,18 @@
from transformers import AutoTokenizer, AutoModel

# load both tokenizers
tokenizer1 = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenizer2 = AutoTokenizer.from_pretrained("roberta-base")

print("Before adding roberta:", len(tokenizer1))

tokens_in_roberta_not_in_bert = set(tokenizer2.vocab).difference(tokenizer1.vocab)
tokenizer1.add_tokens(list(tokens_in_roberta_not_in_bert))

print("After adding roberta:", len(tokenizer1))


model = AutoModel.from_pretrained("bert-base-multilingual-cased")
model.resize_token_embeddings(len(tokenizer1))
26 changes: 26 additions & 0 deletions Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_bpe_tokens.py
@@ -0,0 +1,26 @@
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel


# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# Original vocab size.
print(len(tokenizer))
# Note that most of the output ids are 100, which is BERT's [UNK] (unknown) token.
print(tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))


# Take the first 1000 sentences from cc100.
am_dataset = load_dataset("cc100", "am")
am_train = iter(am_dataset['train'][i]['text'] for i in range(1000))

# Train a new tokenizer using the am_train and the old tokenizer object.
new_tokenizer = tokenizer.train_new_from_iterator(am_train, vocab_size=100_000)

tokenizer.add_tokens(list(new_tokenizer.vocab))

print(new_tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))
print(tokenizer("የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"))

# resize the embedding matrix so the model can use the newly added tokens
model.resize_token_embeddings(len(tokenizer))
18 changes: 18 additions & 0 deletions Experiments/huggingface/tokenizer_extending/extend_tokenizer_with_new_words.py
@@ -0,0 +1,18 @@
from transformers import AutoTokenizer, AutoModel

# pick the model type
model_type = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# new tokens
new_tokens = ["new_token"]

# check if the tokens are already in the vocabulary
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
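
# Optional sanity check (an illustrative addition, not required by the script):
# after add_tokens, "new_token" should surface as a single token instead of
# being split into sub-word pieces by the RoBERTa BPE tokenizer.
print(tokenizer.tokenize("This sentence contains new_token somewhere."))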
