
Commit 4dc23ec

train.py: adding optional tensorboard
1 parent 9168aea commit 4dc23ec

File tree

1 file changed, +8 −2 lines changed


train.py

@@ -61,7 +61,7 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
 
 def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
           sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
-          checkpoint_path):
+          checkpoint_path, with_tensorboard):
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
     #=====START: ADDED FOR DISTRIBUTED======
@@ -107,6 +107,10 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
     os.chmod(output_directory, 0o775)
     print("output directory", output_directory)
 
+    if with_tensorboard and rank == 0:
+        from tensorboardX import SummaryWriter
+        logger = SummaryWriter(os.path.join(output_directory, 'logs'))
+
     model.train()
     epoch_offset = max(0, int(iteration / len(train_loader)))
     # ================ MAIN TRAINNIG LOOP! ===================
@@ -128,13 +132,15 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
 
             if fp16_run:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
+                    scaled_loss.backward()
             else:
                 loss.backward()
 
             optimizer.step()
 
             print("{}:\t{:.9f}".format(iteration, reduced_loss))
+            if with_tensorboard and rank == 0:
+                logger.add_scalar('training_loss', reduced_loss, i)
 
             if (iteration % iters_per_checkpoint == 0):
                 if rank == 0:
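
The change keeps tensorboardX an optional dependency: the import and the SummaryWriter are only created when with_tensorboard is set and only on rank 0, and each training step then logs one scalar. Below is a minimal standalone sketch of that pattern; the dummy loop, loss values, and output path are illustrative placeholders, not taken from train.py.

import os

# Illustrative placeholders; in train.py these arrive as train() arguments.
with_tensorboard = True
rank = 0
output_directory = "checkpoints"

logger = None
if with_tensorboard and rank == 0:
    # Guarded import keeps tensorboardX optional for runs that do not log.
    from tensorboardX import SummaryWriter
    logger = SummaryWriter(os.path.join(output_directory, 'logs'))

for i in range(3):
    reduced_loss = 1.0 / (i + 1)  # stand-in for the real per-iteration loss
    print("{}:\t{:.9f}".format(i, reduced_loss))
    if logger is not None:
        # Shows up under the 'training_loss' tag; the third argument is the step index.
        logger.add_scalar('training_loss', reduced_loss, i)

# Inspect with: tensorboard --logdir checkpoints/logs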
