import cProfile
import gc
import glob
import json
import os
import random
from dataclasses import asdict
import numpy as np
import torch
import torch.distributed as dist
import wandb
from config import TrainingConfig

OUTPUT_DIR = "/workspace/monorepo/output"


def get_latest_checkpoint_dir(run_name: str) -> str:
    # Prefer the "step-final" checkpoint if it exists; otherwise fall back to
    # the checkpoint with the highest numeric step.
    run_dir = get_run_dir(run_name)
    paths = glob.glob(os.path.join(run_dir, "step-*"))
    steps = [p.split("-")[-1] for p in paths]
    if "final" in steps:
        checkpoint_dir = os.path.join(run_dir, "step-final")
    else:
        step = max([int(s) for s in steps])
        checkpoint_dir = os.path.join(run_dir, f"step-{step}")
    return checkpoint_dir


def clear_mem():
    gc.collect()
    with torch.no_grad():
        torch.cuda.empty_cache()


def print_parameters(model):
    # Collapse repeated layer indices so each parameter pattern prints once,
    # e.g. "layers.0.attn.weight" is shown as "layers.%d.attn.weight".
    for name, _ in model.named_parameters():
        if any(str(i) in name for i in range(1, 10)):
            continue
        if "0" in name:
            print(name.replace("0", "%d"))
        else:
            print(name)


def python_profile_function(func):
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        profiler.enable()
        result = func(*args, **kwargs)
        profiler.disable()
        profiler.dump_stats(f"{func.__name__}.prof")
        return result

    return wrapper
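
# Illustrative usage of the decorator above (the decorated function name is a
# placeholder, not defined in this module):
#
#   @python_profile_function
#   def train_step(batch):
#       ...
#
# Each call then writes cProfile stats to "train_step.prof", which can be
# inspected with `python -m pstats train_step.prof` or a viewer such as snakeviz.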


# from Stas Bekman https://github.com/stas00/ml-engineering/tree/master/reproducibility
def enforce_reproducibility(use_seed=None):
    seed = use_seed if use_seed is not None else random.randint(1, 1000000)

    random.seed(seed)  # python RNG
    np.random.seed(seed)  # numpy RNG

    # pytorch RNGs
    torch.manual_seed(seed)  # cpu + cuda
    torch.cuda.manual_seed_all(seed)  # multi-gpu - can be called without gpus

    return seed


def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)
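
# Illustrative wiring of the seeding helpers into a DataLoader (a sketch only;
# `dataset`, batch size, and worker count are placeholders):
#
#   seed = enforce_reproducibility(1234)
#   generator = torch.Generator().manual_seed(seed)
#   loader = torch.utils.data.DataLoader(
#       dataset,
#       batch_size=8,
#       shuffle=True,
#       num_workers=4,
#       worker_init_fn=seed_worker,
#       generator=generator,
#   )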


def get_all_reduce_mean(tensor):
    if hasattr(dist.ReduceOp, "AVG"):
        dist.all_reduce(tensor, op=dist.ReduceOp.AVG)
    else:
        # Older torch versions lack ReduceOp.AVG; fall back to SUM / world_size.
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
        tensor = tensor / dist.get_world_size()
    return tensor
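
# Illustrative use during distributed training (assumes the process group has
# been initialized, e.g. via dist.init_process_group):
#
#   loss_for_logging = get_all_reduce_mean(loss.detach().clone()).item()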


# copied from artidoro/qlora
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || "
        f"all params: {all_param} || "
        f"trainable: {100 * trainable_params / all_param:.2f}%"
    )


def get_checkpoint_dir(step, run_name=None):
    if run_name is None:
        assert wandb.run is not None
        run_name = wandb.run.name
    return os.path.join(OUTPUT_DIR, f"{run_name}/step-{step}")


def get_run_dir(run_name=None):
    if run_name is None:
        assert wandb.run is not None
        run_name = wandb.run.name
    return os.path.join(OUTPUT_DIR, run_name)


def load_config(run_name: str) -> TrainingConfig:
    with open(f"{OUTPUT_DIR}/{run_name}/config.json", "r") as f:
        config = json.loads(f.read())
    return TrainingConfig(**config)


def save_config(config: TrainingConfig, run_name: str):
    run_dir = get_run_dir(run_name)
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    with open(os.path.join(run_dir, "config.json"), "w") as f:
        f.write(json.dumps(asdict(config)))


def export_profile(prof, local_rank):
    trace_path = f"traces/trace_{local_rank}.json"
    timeline_path = f"timelines/memory_timeline_{local_rank}.html"

    print(f"Exporting chrome trace to {trace_path}")
    if not os.path.exists("traces"):
        os.makedirs("traces")
    prof.export_chrome_trace(trace_path)

    print(f"Exporting memory timeline to {timeline_path}")
    if not os.path.exists("timelines"):
        os.makedirs("timelines")
    prof.export_memory_timeline(timeline_path, f"cuda:{local_rank}")