Skip to content

Commit

Permalink
fix no sents (skip examples left with no sentences after filtering)
Browse files (browse the repository at this point in the history)
  • Loading branch information
markus583 committed Apr 30, 2024
1 parent 0c5cae4 commit b4b1704
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions wtpsplit/tokenization_utils.py
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@ def tokenize_and_get_labels(sentences, tokenizer, separator, lang_code):
return_offsets_mapping=True,
add_special_tokens=False,
truncation=False,
verbose=False,
padding=False,
)

tokens = tokenized_input.tokens()
@@ -57,6 +59,8 @@ def pack_sentences(examples, block_size, tokenizer, underflow_size=0, min_senten
# tokenization mapping gets problematic in such instances
sentences = [sentence.replace("\ufffd", "").strip() for sentence in sentences]
sentences = [sentence for sentence in sentences if len(sentence) > min_sentence_length]
if not sentences:
continue

# batch tokenize sentences
tokenized_sentences = tokenizer(sentences, add_special_tokens=False, verbose=False, padding=False)

0 comments on commit b4b1704

Please sign in to comment.