-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtrain.py
145 lines (123 loc) · 6.49 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import argparse
import os
import torch
import wandb
from torch import nn
from torch.utils import data
from torch.utils.data import DistributedSampler
import torch.multiprocessing as mp
from Trainer import Trainer
from core.logger import InfoLogger, VisualWriter
import core.parser as Parser
import core.util as Util
from diffusion.gaussian_diffusion import GaussianDiffusion, get_beta_schedule
from core.parser import init_obj
from diffusion import gaussian_diffusion as gd
from mri_utils import ksp_to_viewable_image
def mae(input, target):
    """Return the mean absolute error (L1 loss) between ``input`` and ``target``.

    The computation runs under ``torch.no_grad()``, so the returned tensor is
    not tracked by autograd.
    """
    with torch.no_grad():
        return nn.functional.l1_loss(input, target)
def define_network(logger, opt, network_opt):
    """Instantiate the network described by ``network_opt``.

    The object is created with ``init_obj(network_opt, logger)``. When
    ``opt['phase']`` equals ``'train'``, the network's class name and the
    configured ``init_type`` (``'default'`` when the key is absent) are written
    to ``logger``. This function only logs that information; it does not run
    any weight initialization itself.

    Returns:
        The object returned by ``init_obj``.
    """
    network = init_obj(network_opt, logger)
    if opt['phase'] != 'train':
        return network

    message = 'Network [{}] weights initialize using [{:s}] method.'.format(
        network.__class__.__name__,
        network_opt['args'].get('init_type', 'default'),
    )
    logger.info(message)
    return network
def main_worker(gpu, opt, wandb_run=''):
    """Build every training component for one process and run the trainer.

    Args:
        gpu: Index of this worker. It is stored as both ``opt['local_rank']``
            and ``opt['global_rank']`` when ``opt`` has no ``'local_rank'``
            key, and it is compared against 0 to decide whether this worker
            opens a W&B run.
        opt: Parsed experiment configuration, indexed like a nested dict.
        wandb_run: W&B entity name. A run is started only when this is
            non-empty and ``gpu == 0``; otherwise the trainer receives ``None``.
    """
    if 'local_rank' not in opt:
        opt['local_rank'] = gpu
        opt['global_rank'] = gpu

    if opt['distributed']:
        device_index = int(opt['local_rank'])
        torch.cuda.set_device(device_index)
        print('using GPU {} for training'.format(device_index))
        torch.distributed.init_process_group(
            backend='nccl',
            init_method=opt['init_method'],
            world_size=opt['world_size'],
            rank=opt['global_rank'],
            group_name='mtorch',
        )

    # Enable cuDNN and apply the configured seed through Util.set_seed.
    torch.backends.cudnn.enabled = True
    Util.set_seed(opt['seed'])

    phase_logger = InfoLogger(opt)
    phase_writer = VisualWriter(opt, phase_logger)
    phase_logger.info('Create the log file in directory {}.\n'.format(opt['path']['experiments_root']))

    # Network and diffusion process.
    model_opt = opt['model']
    network = define_network(phase_logger, opt, model_opt['network'])

    # 'mean_type' is optional in the config and falls back to "eps".
    mean_type_name = model_opt.get('mean_type', "eps")
    mean_type = {
        "eps": gd.ModelMeanType.EPSILON,
        "x": gd.ModelMeanType.START_X,
    }[mean_type_name]

    beta_schedule_opt = model_opt['diffusion']['beta_schedule']
    diffusion = GaussianDiffusion(
        betas=get_beta_schedule(**beta_schedule_opt),
        model_mean_type=mean_type,
        model_var_type=gd.ModelVarType.FIXED_LARGE,
        loss_type=gd.LossType.MSE,
    )

    # Datasets and loaders.
    datasets_opt = opt['datasets']
    train_dataset = init_obj(
        datasets_opt['train']['which_dataset'],
        phase_logger,
        default_file_name='data.dataset',
        init_type='Dataset',
    )
    val_dataset = init_obj(
        datasets_opt['validation']['which_dataset'],
        phase_logger,
        default_file_name='data.dataset',
        init_type='Dataset',
    )

    train_loader_cfg = datasets_opt['train']['dataloader']['args']
    loader_opts = dict(**train_loader_cfg)
    val_loader_opts = dict(**datasets_opt['validation']['dataloader']['args'])

    data_sampler = None
    if opt['distributed']:
        # The sampler takes over shuffling, so the loader's own shuffle flag
        # is switched off in the copied options.
        data_sampler = DistributedSampler(
            train_dataset,
            shuffle=train_loader_cfg['shuffle'],
            num_replicas=opt['world_size'],
            rank=opt['global_rank'],
        )
        loader_opts["shuffle"] = False

    train_loader = data.DataLoader(train_dataset, sampler=data_sampler, **loader_opts)
    val_loader = data.DataLoader(val_dataset, **val_loader_opts)

    # Optional 'base_change' key: absent -> None, "mri" -> ksp_to_viewable_image.
    base_change = {None: None, "mri": ksp_to_viewable_image}[model_opt.get('base_change')]

    # Only the worker with gpu == 0 opens a W&B run, and only if an entity was given.
    if gpu != 0 or not wandb_run:
        tracker = None
    else:
        tracker = wandb.init(project="GSURE-Diffusion", entity=wandb_run, config={})
        tracker.config.update(opt)

    print("Dataset size: ", len(train_dataset))

    trainer_args = model_opt['trainer']['args']
    trainer = Trainer(
        network=network,
        diffusion=diffusion,
        phase_loader=train_loader,
        val_loader=val_loader,
        metrics=[mae],
        logger=phase_logger,
        writer=phase_writer,
        wandb_run=tracker,
        sample_num=beta_schedule_opt["num_diffusion_timesteps"],
        task="unconditional",
        optimizers=trainer_args['optimizers'],
        ema_scheduler=trainer_args['ema_scheduler'],
        sigma_0=trainer_args['sigma_0'],
        base_change=base_change,
        model_wrapper=model_opt.get('model_wrapper', False),
        Lambda=model_opt.get('Lambda', 1),
        gsure=model_opt.get('gsure', True),
        opt=opt,
    )
    trainer.train()
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, build the config, select
    # the visible GPUs, then start training in one or several processes.
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True, help='JSON file for configuration')
    parser.add_argument('-b', '--batch', type=int, default=None, help='Batch size in every gpu')
    parser.add_argument('-p', '--phase', type=str, choices=['train'], help='Run train or test', default='train')
    parser.add_argument('-d', '--debug', action='store_true', help='Run script in debug setting')
    parser.add_argument('-P', '--port', default='21012', type=str, help='Port setting for DDP')
    parser.add_argument('-gpu', '--gpu_ids', type=str, default=None, help='GPU numbers to use for training')
    parser.add_argument('--wandb', type=str, default='', help='W & B entity to use for wandb, leave empty for no W & B sync')

    # Parse the command line and turn it into the experiment configuration.
    args = parser.parse_args()
    opt = Parser.parse(args)

    # Expose only the configured GPU ids to CUDA.
    gpu_str = ','.join(str(x) for x in opt['gpu_ids'])
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_str
    print('export CUDA_VISIBLE_DEVICES={}'.format(gpu_str))

    # Use DistributedDataParallel (DDP) and multiprocessing for multi-GPU training.
    if opt['distributed']:
        ngpus_per_node = len(opt['gpu_ids'])
        opt['world_size'] = ngpus_per_node
        opt['init_method'] = 'tcp://127.0.0.1:' + args.port
        # mp.spawn calls main_worker(i, *args) for i in range(nprocs). The
        # args tuple must therefore match main_worker's parameters after
        # `gpu`, i.e. (opt, wandb_run), and nprocs must equal world_size so
        # that one process is started per GPU.
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(opt, args.wandb))
    else:
        opt['world_size'] = 1
        main_worker(0, opt, args.wandb)