# utils.py (forked from AnswerDotAI/cold-compress)
import torch
from dataclasses import dataclass
from transformers.tokenization_utils import PreTrainedTokenizer
from typing import List, Dict, Any


def get_max_length_in_nested_lists(lst):
    """Return the length of the longest innermost list in a (possibly nested) list."""
    if len(lst) and isinstance(lst[0], list):
        lengths = []
        for elem in lst:
            length = get_max_length_in_nested_lists(elem)
            lengths.append(length)
        max_length = max(lengths)
        return max_length
    else:
        return len(lst)
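
# Illustrative example (not part of the original source): the longest innermost
# list below has three elements, so the function returns 3.
#   get_max_length_in_nested_lists([[1, 2], [3, 4, 5]]) -> 3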


def pad_nested_lists(lst, max_length, padding_value, padding_side="right"):
    """Pad every innermost list in `lst` to `max_length` with `padding_value`.

    Returns the padded structure together with a parallel mask structure
    marking real values (1) versus padding (0).
    """
    if isinstance(lst, list) and len(lst) and isinstance(lst[0], list):
        masks = []
        for i, elem in enumerate(lst):
            lst[i], mask = pad_nested_lists(elem, max_length, padding_value, padding_side)
            masks.append(mask)
        return lst, masks
    elif isinstance(lst, list):
        if padding_side == "right":
            mask = [1] * len(lst) + [0] * (max_length - len(lst))
            lst = lst + [padding_value for _ in range(max_length - len(lst))]
            return lst, mask
        else:
            mask = [0] * (max_length - len(lst)) + [1] * len(lst)
            lst = [padding_value for _ in range(max_length - len(lst))] + lst
            return lst, mask
    else:
        raise NotImplementedError(f"Unrecognized type {type(lst)}")
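
# Illustrative example (not part of the original source): right-padding a ragged
# batch to length 2 with pad value 0 also yields the matching mask.
#   pad_nested_lists([[7, 8], [9]], 2, 0, "right")
#   -> ([[7, 8], [9, 0]], [[1, 1], [1, 0]])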


@dataclass
class DefaultDataCollator:
    """
    Data collator that can:
    1. Dynamically pad all inputs received. The inputs must be a dict of lists.
    2. Add position_ids based on attention_mask if required.
    """

    tokenizer: PreTrainedTokenizer
    attention_padding_value: int = 0
    label_padding_value: int = -100

    # Keys whose values are padded and converted to tensors; everything else
    # (e.g. strings, None) is passed through unchanged.
    keys_to_tensorize = {
        "input_ids", "attention_mask", "labels", "position_ids",
        "token_type_ids", "length", "depth", "index",
    }

    def __call__(self, batch_elem: List) -> Dict[str, Any]:
        first_elem = batch_elem[0]
        return_batch = {}

        for key, value in first_elem.items():
            # HACK: any key containing attention_mask must be attention_mask
            # important to assign different pad token for different types of inputs
            if "attention_mask" in key:
                pad_token_id = self.attention_padding_value
            elif "label" in key:
                pad_token_id = self.label_padding_value
            else:
                pad_token_id = self.tokenizer.pad_token_id

            batch_value = [elem[key] for elem in batch_elem]
            # pad all lists and nested lists
            if isinstance(value, list) and key in self.keys_to_tensorize:
                max_length = get_max_length_in_nested_lists(batch_value)
                batch_value, _ = pad_nested_lists(
                    batch_value, max_length, pad_token_id, self.tokenizer.padding_side
                )

            if key in self.keys_to_tensorize:
                return_batch[key] = torch.tensor(batch_value)
            else:
                # handle strings and None
                return_batch[key] = batch_value

        return return_batch
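

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the original file).
    # A SimpleNamespace stands in for a real PreTrainedTokenizer, since the
    # collator only reads `pad_token_id` and `padding_side` from the tokenizer.
    from types import SimpleNamespace

    stub_tokenizer = SimpleNamespace(pad_token_id=0, padding_side="right")
    collator = DefaultDataCollator(tokenizer=stub_tokenizer)

    batch = [
        {"input_ids": [5, 6, 7], "attention_mask": [1, 1, 1], "labels": [5, 6, 7], "id": "a"},
        {"input_ids": [8, 9], "attention_mask": [1, 1], "labels": [8, 9], "id": "b"},
    ]
    out = collator(batch)
    print(out["input_ids"])       # tensor([[5, 6, 7], [8, 9, 0]])
    print(out["attention_mask"])  # tensor([[1, 1, 1], [1, 1, 0]])
    print(out["labels"])          # tensor([[5, 6, 7], [8, 9, -100]])
    print(out["id"])              # ['a', 'b'] -- non-tensorized keys pass through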