run_LM.py (forked from b0kch01/llama-cpu)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.
from typing import Dict, List
import os
import re
import sys
import gzip
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import fire
import time
import json
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from llama import ModelArgs, Transformer, Tokenizer, LLaMA
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO
)

# this marks the position of interest in our datasets.
# sentences will be stripped of this token and everything
# after it. (we also use these datasets for MLM and seq2seq
# tasks, where the following context is not stripped.)
MASK_TOKEN: str = '<extra_id_0>'
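
# As a hypothetical illustration (this sentence is not from any particular
# dataset), a line in the txt.gz file might read:
#   The key to the cabinets <extra_id_0> on the table.
# After stripping at MASK_TOKEN, the model is prompted with
# "The key to the cabinets" and its next-word predictions are scored.

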
class Timer():
    '''
    Prints a message about how long a block of code takes to run.
    Usage:
        with Timer(log_fn=...):
            ...
    Result:
        After the code is run, calls log_fn with a string
        "Completed in n seconds"
    '''
    def __init__(self, log_fn=print):
        self.log_fn = log_fn

    def __enter__(self):
        self._start_time = time.time()

    def __exit__(self, type, value, traceback):
        self.log_fn(
            f'Completed in {time.time() - self._start_time:.2f} seconds'
        )


def load_tokenizer(
    tokenizer_path: str,
) -> Tokenizer:
    '''
    Loads the LLaMA tokenizer.
    We do this separately so we can tokenize
    the dataset before loading the model, and thus
    automatically adjust the maximum sequence length.
    '''
    logger.info('Creating tokenizer...')
    tokenizer = Tokenizer(model_path=tokenizer_path)
    return tokenizer


def load_llama(
    ckpt_dir: str,
    tokenizer: Tokenizer,
    max_seq_len: int,
    max_batch_size: int = 1,
) -> LLaMA:
    '''
    Loads and returns a LLaMA generator.
    params:
        ckpt_dir (str): the location of the directory containing the LLaMA checkpoint
        tokenizer (Tokenizer): the tokenizer for the model
        max_seq_len (int): the maximum sequence length that can be generated.
            must be at least the length of the longest (tokenized) input sequence
        max_batch_size (int): at most this many examples will be run in a batch
    returns:
        LLaMA: the LLaMA generator
    '''
    checkpoints = sorted(Path(ckpt_dir).glob('*.pth'))
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    model_args = ModelArgs(
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
        **params
    )
    model_args.vocab_size = tokenizer.n_words

    logger.info('Creating transformer...')
    model = Transformer(model_args)

    # Original copyright by tloen
    # https://github.com/tloen/llama-int8/blob/main/example.py
    # for each parameter type, the dimension along which it is sharded
    # across the checkpoint files (None = replicated, so load it only once)
    key_to_dim = {
        "w1": 0,
        "w2": -1,
        "w3": 0,
        "wo": -1,
        "wq": 0,
        "wk": 0,
        "wv": 0,
        "output": 0,
        "tok_embeddings": -1,
        "ffn_norm": None,
        "attention_norm": None,
        "norm": None,
        "rope": None,
    }

    logger.info('Loading checkpoints to model...')
    with Timer(logger.info):
        for i, ckpt in tqdm(enumerate(checkpoints), total=len(checkpoints)):
            checkpoint = torch.load(ckpt, map_location="cpu")
            for parameter_name, parameter in model.named_parameters():
                short_name = parameter_name.split(".")[-2]
                # copy each shard into its slice of the full parameter,
                # then free it to keep peak memory down
                if key_to_dim[short_name] is None and i == 0:
                    parameter.data = checkpoint[parameter_name]
                elif key_to_dim[short_name] == 0:
                    size = checkpoint[parameter_name].size(0)
                    parameter.data[size * i: size * (i + 1), :] = checkpoint[
                        parameter_name
                    ]
                elif key_to_dim[short_name] == -1:
                    size = checkpoint[parameter_name].size(-1)
                    parameter.data[:, size * i: size * (i + 1)] = checkpoint[
                        parameter_name
                    ]
                del checkpoint[parameter_name]
            del checkpoint

    model.to("cpu")

    generator = LLaMA(model, tokenizer)
    setattr(generator, 'model_name_or_path', ckpt_dir)
    return generator


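# A minimal usage sketch (paths are hypothetical; adjust them to your local
# checkpoint layout). With a single-file checkpoint such as 7B, the loop above
# runs only once:
#   tokenizer = load_tokenizer('llama-checkpoints/tokenizer.model')
#   generator = load_llama(
#       ckpt_dir='llama-checkpoints/7B',
#       tokenizer=tokenizer,
#       max_seq_len=128,
#       max_batch_size=1,
#   )

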
def load_dataset(
    dataset_path: str,
) -> List[str]:
    '''
    Loads a dataset from disk. This should be a txt.gz file with one
    example to run per line. Positions of interest should be marked
    with MASK_TOKEN (defined above). Because of how generation for
    decoder-only models works, only one position of interest
    can be defined per sentence.
    '''
    with gzip.open(dataset_path, 'rt') as in_file:
        dataset = in_file.readlines()

    # remove the position of interest and everything after it,
    # since LLaMA models predict the next word
    dataset = [example.split(MASK_TOKEN, 1)[0].strip() for example in dataset]
    return dataset


def preprocess_dataset(
    dataset: List[str],
    tokenizer: Tokenizer
) -> torch.Tensor:
    '''
    Formats the dataset for use with a LLaMA model.
    params:
        dataset (List[str]): a list of strings to use as prompts for the model
        tokenizer (Tokenizer): the tokenizer to use to prepare the examples for the model
    returns:
        torch.Tensor: the dataset formatted for use with a LLaMA model
    '''
    def preprocess_function(example: str) -> List[int]:
        '''Tokenizes a string input.'''
        model_inputs = tokenizer.encode(example, bos=True, eos=False)
        return model_inputs

    def pad_tensor(t: torch.Tensor, pad: int, dim: int = -1) -> torch.Tensor:
        '''
        Pads a tensor to length pad in dim dim.
        From https://discuss.pytorch.org/t/dataloader-for-various-length-of-data/6418/8
        params:
            t (torch.Tensor): tensor to pad
            pad (int): the size to pad to
            dim (int): dimension to pad
        returns:
            a new torch.Tensor padded to 'pad' in dimension 'dim'
        '''
        pad_size = list(t.shape)
        pad_size[dim] = pad - t.size(dim)
        return torch.cat([
            t,
            torch.full(
                size=pad_size,
                fill_value=tokenizer.pad_id,
                dtype=t.dtype,
                device=t.device
            )
        ], dim=dim)

    dataset = [torch.tensor(preprocess_function(example)) for example in dataset]

    # we increase the maximum sequence length by 1 so we can generate the next token
    max_seq_len = max(len(example) for example in dataset) + 1
    logger.info(f'Maximum sequence length in dataset is {max_seq_len - 1} tokens')

    dataset = torch.stack([pad_tensor(t=t, pad=max_seq_len, dim=-1) for t in dataset])
    return dataset


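# Sketch of the padding step with made-up values: two tokenized prompts of
# lengths 5 and 3 both become rows of length max(5, 3) + 1 = 6, with the extra
# positions filled by tokenizer.pad_id, so the whole dataset stacks into one
# (num_examples, max_seq_len) tensor that the DataLoader below can batch.

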
def evaluate_language_modeling(
    generator: LLaMA,
    dataset: torch.Tensor,
    dataset_path: str,
    output_dir: str,
    max_batch_size: int = 1,
) -> None:
    '''
    Evaluates a LLaMA model on the language modeling task defined
    by the dataset.
    params:
        generator (LLaMA): the LLaMA generator to evaluate
        dataset (torch.Tensor): a dataset formatted for use with a LLaMA model.
            this should be a tensor of shape (num_examples, max_seq_len)
        dataset_path (str): the path to the dataset file. used to find the metadata for the dataset.
            the metadata contains information about which tokens to evaluate for
            each example
        output_dir (str): where to save the results of the evaluation
        max_batch_size (int): the maximum batch size
    '''
    output_file = os.path.join(output_dir, f'{os.path.split(generator.model_name_or_path)[-1]}.lm_results.csv.gz')
    if os.path.exists(output_file):
        # return commented out for testing
        # return
        pass

    with gzip.open(dataset_path.replace('.txt.gz', '_metadata.json.gz'), 'rt', encoding='utf-8') as in_file:
        metadata = [json.loads(l) for l in in_file.readlines()]

    os.makedirs(output_dir, exist_ok=True)

    dataloader = DataLoader(dataset, batch_size=max_batch_size)

    n_observed_examples = 0
    metrics = []
    for inputs in tqdm(dataloader):
        n_examples_in_batch = inputs.shape[0]

        # use this as a unique input identifier
        input_nums = range(n_observed_examples, n_observed_examples + n_examples_in_batch)
        n_observed_examples += n_examples_in_batch

        batch_metadata = metadata[(n_observed_examples - n_examples_in_batch):n_observed_examples]

        metrics.extend(evaluate_batch(
            generator=generator,
            inputs=inputs,
            input_nums=input_nums,
            batch_metadata=batch_metadata
        ))

    metrics = pd.DataFrame(metrics)
    metrics = metrics.assign(
        model_name=re.sub('["\']', '', generator.model_name_or_path),
        architecture='decoder-only',
        n_params=f'{round(sum(p.numel() for p in generator.model.parameters() if p.requires_grad)/1000000000)}B',
        test_dataset=os.path.basename(dataset_path).replace('.txt.gz', ''),
        n_test_examples=len(dataset)
    )

    metrics.to_csv(output_file, index=False, na_rep='NaN')


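# Assuming the metadata carries only 'eval_tokens' plus optional extra fields,
# each row of the resulting CSV holds roughly: item, input_text, pred_seq,
# token, token_id, logprob, any per-example metadata fields, and the run-level
# columns assigned above (model_name, architecture, n_params, test_dataset,
# n_test_examples).

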
def evaluate_batch(
    generator: LLaMA,
    inputs: torch.Tensor,
    input_nums: List[int] = None,
    batch_metadata: List[Dict] = None,
) -> List[Dict]:
    '''
    Evaluate a single batch of inputs on the eval tokens.
    params:
        generator (LLaMA): the LLaMA generator being evaluated
        inputs (torch.Tensor): the batch of inputs to evaluate. shape: (batch_size, max_seq_len)
        input_nums (List[int]): a list of unique numerical identifiers for the inputs.
            if not provided, range(len(inputs)) is used
        batch_metadata (List[Dict]): for each example, the metadata for that example.
            must minimally contain a List of eval tokens for each example,
            under the key `eval_tokens`
    returns:
        List[Dict]: a List of dictionaries, each of which contains the results
            of the evaluation for the corresponding example
    '''
    if input_nums is None:
        input_nums = range(len(inputs))

    eval_tokens = [d.get("eval_tokens") for d in batch_metadata]

    if not any(eval_tokens):
        raise ValueError('No tokens were specified for evaluation.')

    if len(eval_tokens) != len(inputs):
        raise ValueError(
            f'{len(eval_tokens)} sets of eval tokens were '
            f'provided for {len(inputs)} sentences.'
        )

    eval_token_ids = get_eval_token_ids(
        tokenizer=generator.tokenizer,
        eval_tokens=eval_tokens
    )

    return evaluate_lm_batch(
        generator=generator,
        inputs=inputs,
        input_nums=input_nums,
        eval_tokens=eval_tokens,
        eval_token_ids=eval_token_ids,
        batch_metadata=batch_metadata,
    )


def get_eval_token_ids(
    tokenizer: Tokenizer,
    eval_tokens: List[List[str]]
) -> List[List[int]]:
    '''
    Get the token ids for the eval tokens.
    params:
        tokenizer (Tokenizer): the tokenizer to use to encode the eval tokens
        eval_tokens (List[List[str]]): for each example in a batch, a list of eval_tokens
            to use to evaluate the corresponding example
    returns:
        List[List[int]]: the token ids of the eval tokens
    raises:
        ValueError via check_ids: if any tokens are not in the model vocabulary
    '''
    eval_token_ids = [[tokenizer.encode(t, bos=False, eos=False) for t in tokens] for tokens in eval_tokens]

    # check that the eval tokens are single tokens
    check_ids(eval_tokens=eval_tokens, eval_token_ids=eval_token_ids)

    # flatten each per-token encoding (a single-id list) into one flat list of ids per example
    eval_token_ids = [[id for t in token_ids for id in t] for token_ids in eval_token_ids]
    return eval_token_ids


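# For instance (the exact ids depend on the SentencePiece model, so these are
# placeholders), eval_tokens like [['is', 'are']] would encode to something
# like [[[338], [526]]], pass check_ids because each inner list has length 1,
# and then flatten to [[338, 526]].

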
def check_ids(
    eval_tokens: List[List[str]],
    eval_token_ids: List[List[List[int]]],
) -> None:
    '''
    Verifies that every token in eval_token_ids is encoded as a single token id.
    This ensures that the tokens exist in the tokenizer's vocabulary.
    params:
        eval_tokens (List[List[str]]): the list of eval tokens as strings. Used
            to make the ValueError message more useful
        eval_token_ids (List[List[List[int]]]): the encodings of the tokens by the tokenizer to check
    raises:
        ValueError: if any token_ids are longer than 1 token, raises a ValueError
            because that token is not a word in the tokenizer's vocabulary
    '''
    for tokens, token_ids in zip(eval_tokens, eval_token_ids):
        if any(len(token_id) > 1 for token_id in token_ids):
            raise ValueError(
                'Some tokens used for evaluation are not tokenized as single words!:\n\n' +
                "\n".join(
                    [str(t) for t in zip(tokens, token_ids) if len(t[-1]) > 1]
                )
            )


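# As a hypothetical failure case: an eval token that the tokenizer splits into
# several subword pieces would encode to a multi-id list, so check_ids raises a
# ValueError rather than silently scoring only the first piece.

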
def evaluate_lm_batch(
    generator: LLaMA,
    inputs: torch.Tensor,
    input_nums: List[int],
    eval_tokens: List[List[str]],
    eval_token_ids: List[List[int]],
    batch_metadata: List[Dict],
) -> List[Dict]:
    '''
    Evaluates a batch of examples on a language modeling (=next word prediction) task.
    params:
        generator (LLaMA): the generator to run on the inputs
        inputs (torch.Tensor): the tokenized batch to run next word predictions for
        input_nums (List[int]): a list of numerical input identifiers
        eval_tokens (List[List[str]]): for each example in the inputs, the tokens to extract
            log probabilities for
        eval_token_ids (List[List[int]]): for each example in the inputs, the encoded tokens
            to extract log probabilities for
        batch_metadata (List[Dict]): for each input, a dictionary containing metadata to add
            to the results
    returns:
        List[Dict]: for each example_i * eval_tokens_example_i, a dictionary containing
            the next word predicted by the generator for that example,
            and the log probability predicted for that eval token
    '''
    with torch.no_grad():
        batch_outputs = generator(tokens=inputs)

    batch_scores = torch.stack([t[:generator.tokenizer.n_words] for t in batch_outputs])
    batch_logprobs = F.log_softmax(batch_scores, dim=-1)

    metrics = []
    records = zip(input_nums, inputs, batch_outputs, eval_tokens, eval_token_ids, batch_logprobs, batch_metadata)
    for input_num, input_seq, pred_token, tokens, token_ids, score, example_metadata in records:
        metrics.extend(
            [
                {
                    'item': input_num,
                    'input_text': generator.tokenizer.decode(input_seq[torch.where((input_seq != generator.tokenizer.pad_id).nonzero(as_tuple=True)[0])].tolist()),
                    'pred_seq': generator.tokenizer.decode(torch.argmax(pred_token, dim=-1).tolist()),  # we use "pred_seq" for consistency with other models
                    'token': token,
                    'token_id': token_id,
                    'logprob': score[token_id].item(),
                    **{k: v for k, v in example_metadata.items() if not k == 'eval_tokens'}
                } for token, token_id in zip(tokens, token_ids)
            ]
        )

    return metrics


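# Note on the scores: batch_logprobs applies log_softmax over the (unpadded)
# vocabulary, so score[token_id] gives the model's predicted log probability
# for that eval token; exponentiating and summing over the full vocabulary
# would give 1.

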
def main(
    ckpt_dir: str,
    tokenizer_path: str,
    dataset_path: str,
    max_batch_size: int,
) -> None:
    '''
    Runs a language modeling task using a LLaMA generator.
    params:
        ckpt_dir (str): the location of the LLaMA checkpoint (e.g., llama-checkpoints/7B)
        tokenizer_path (str): the location of the LLaMA tokenizer (e.g., llama-checkpoints/tokenizer.model)
        dataset_path (str): the location of the dataset to use for evaluation.
            datasets should consist of two files. one is a txt.gz file
            that contains a single example per line, with the position
            of interest marked with `<extra_id_0>`.
            the second is a file with the same name + `_metadata`, a .json.gz file
            which for each line of the txt.gz file contains the metadata
            for the corresponding example (minimally, a dictionary with key `eval_tokens`
            containing a list of strings to extract log probabilities from the next word
            predictions for the corresponding sentence)
        max_batch_size (int): how many examples to run at the same time.
            since LLaMA models are large, unless you have a lot of resources,
            it is best to keep this small
    Results are saved to disk in outputs/$dataset_name/$checkpoint_size.lm_results.csv.gz
    '''
    # we load the tokenizer and dataset first
    # to automatically determine the maximum sequence length
    dataset = load_dataset(dataset_path=dataset_path)
    tokenizer = load_tokenizer(tokenizer_path=tokenizer_path)
    dataset = preprocess_dataset(dataset=dataset, tokenizer=tokenizer)
    max_seq_len = dataset.size()[-1]

    generator = load_llama(
        ckpt_dir=ckpt_dir,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )

    output_dir = os.path.join('outputs', os.path.split(dataset_path)[-1].replace('.txt.gz', ''))

    evaluate_language_modeling(
        generator=generator,
        dataset=dataset,
        dataset_path=dataset_path,
        output_dir=output_dir,
        max_batch_size=max_batch_size,
    )


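# Example invocation (the paths are hypothetical; fire.Fire exposes main's
# parameters as command-line flags):
#   python run_LM.py \
#       --ckpt_dir llama-checkpoints/7B \
#       --tokenizer_path llama-checkpoints/tokenizer.model \
#       --dataset_path datasets/my_dataset.txt.gz \
#       --max_batch_size 1

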
if __name__ == '__main__':
    fire.Fire(main)