@@ -61,7 +61,7 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
 
 def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
           sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
-          checkpoint_path):
+          checkpoint_path, with_tensorboard):
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
     #=====START: ADDED FOR DISTRIBUTED======
@@ -107,6 +107,10 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
             os.chmod(output_directory, 0o775)
         print("output directory", output_directory)
 
+    if with_tensorboard and rank == 0:
+        from tensorboardX import SummaryWriter
+        logger = SummaryWriter(os.path.join(output_directory, 'logs'))
+
     model.train()
     epoch_offset = max(0, int(iteration / len(train_loader)))
     # ================ MAIN TRAINNIG LOOP! ===================
@@ -128,13 +132,15 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
 
             if fp16_run:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
+                    scaled_loss.backward()
             else:
                 loss.backward()
 
             optimizer.step()
 
             print("{}:\t{:.9f}".format(iteration, reduced_loss))
+            if with_tensorboard and rank == 0:
+                logger.add_scalar('training_loss', reduced_loss, i)
 
             if (iteration % iters_per_checkpoint == 0):
                 if rank == 0:
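For context, here is a minimal standalone sketch of the tensorboardX logging pattern this patch wires into train(); it assumes tensorboardX is installed, and the log directory name and dummy loss values are illustrative, not part of the change:

# Sketch of the SummaryWriter usage added by the patch (hypothetical values).
import os
from tensorboardX import SummaryWriter

output_directory = "checkpoints"  # hypothetical output directory
logger = SummaryWriter(os.path.join(output_directory, "logs"))

# Log one scalar per iteration, mirroring logger.add_scalar('training_loss', reduced_loss, i).
for i, loss in enumerate([1.25, 0.98, 0.71]):  # stand-in for reduced_loss values
    logger.add_scalar("training_loss", loss, i)

logger.close()

The scalars can then be inspected by pointing TensorBoard at the chosen log directory.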