loop.py
import time

import torch
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import LlamaForCausalLM, LlamaTokenizer

from get_data import load_data
from progress_bar import get_progress_bar
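# Accelerator picks up the launch configuration (single GPU or distributed) and
# handles device placement and gradient scaling for everything passed to prepare().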
accelerator = Accelerator()
####################
# CONSTANTS #
####################
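# -100 is the label id that Hugging Face's causal-LM cross-entropy loss ignores
# (presumably applied to prompt/padding tokens inside get_data).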
IGNORE_TOKEN = -100
MODEL_PATH = "/mnt/models/llama2/hf/Llama-2-7b-hf"
NAME = "llama2-dolly-15k"
BATCH_SIZE = 8
NUM_EPOCHS = 3
####################
# TOKENIZER + DATA #
####################
tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH, legacy=False)
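# The Llama tokenizer ships without a pad token, so reuse EOS for padding.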
tokenizer.pad_token = tokenizer.eos_token
train_dataloader, eval_dataloader = load_data(tokenizer, BATCH_SIZE)
num_batches = len(train_dataloader)
####################
# MODEL #
####################
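# Load the base weights in bfloat16 to roughly halve memory versus fp32;
# device_map="auto" lets transformers spread them across the available devices.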
model = LlamaForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
model = accelerator.prepare(model)
####################
# OPTIMIZER + PREP #
####################
optimizer = AdamW(model.parameters(), lr=3e-5)
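# prepare() wraps the optimizer and dataloaders so they match the model's device and any distributed setup.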
optimizer, train_dataloader, eval_dataloader = accelerator.prepare(optimizer, train_dataloader, eval_dataloader)
print("Prepared model to run with Accelerator")
# PROGRESS BAR
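# get_progress_bar presumably returns a rich-style Progress with custom loss/speed columns,
# scoped to the main process via the accelerator.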
progress_bar = get_progress_bar(accelerator)
with progress_bar:
    task = progress_bar.add_task(
        "Training", total=num_batches * NUM_EPOCHS, loss=0.0, speed=0.0
    )
    loss_history = []

    def rolling_average(window):
        # Average of the most recent `window` losses (or everything collected so far).
        if len(loss_history) < window:
            return sum(loss_history) / len(loss_history)
        return sum(loss_history[-window:]) / window

    for epoch in range(NUM_EPOCHS):
        model.train()  # set model to train mode
        total_loss = 0
        for idx, batch in enumerate(train_dataloader):
            start = time.time()
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]
            # Forward pass; the model computes the causal-LM loss from `labels` internally
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss_history.append(loss.item())
            # Backpropagation and optimization
            optimizer.zero_grad()
            accelerator.backward(loss)  # use accelerator.backward so mixed precision / DDP are handled
            optimizer.step()
            progress_bar.update(
                task, advance=1, loss=loss.item(), speed=time.time() - start
            )
            if idx % 50 == 0:
                print(f"Avg. loss at epoch {epoch + idx / num_batches:.3f}: {rolling_average(50):.4f}")
        torch.cuda.empty_cache()
        # TODO: model evaluate (remember to switch the model out of train mode)

# save model
model.save_pretrained(f"/mnt/finetunes/{NAME}")
tokenizer.save_pretrained(f"/mnt/finetunes/{NAME}")