forked from XME-anonymous/XME
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfinetuning_mned_mbert.txt
553 lines (550 loc) · 175 KB
/
finetuning_mned_mbert.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
[2023-03-25 14:47:21,305][__main__][INFO] -
alg: mend
lr: 1.0e-06
edit_lr: 0.0001
seed: 0
debug: false
model_save_pt: 5000
edit_bs: 1
silent: false
max_iters: 1000000
log_interval: 100
val_interval: 5000
lr_lr: 0.0001
batch_size: 2
val_batch_size: 5
accumulate_bs: 10
cedit: 0.1
cloc: 1.0
cbase: 1.0
val_steps: 500
device: cuda
base_loss: distill
oracle: false
train: true
train_base: false
opt: Adam
single_batch: false
archive: null
grad_clip: 100.0
ref: null
early_stop_patience: 20000
early_stop_key: loss/total_edit_val
dropout: 0.0
tokenizer: null
results_dir: null
no_grad_layers: null
eval_only: false
half: false
save: false
model:
pt: null
name: bert-base-multilingual-uncased
class_name: BertForSequenceClassification
tokenizer_class: BertTokenizer
tokenizer_name: bert-base-multilingual-uncased
inner_params:
- bert.encoder.layer.9.intermediate.dense.weight
- bert.encoder.layer.9.output.dense.weight
- bert.encoder.layer.10.intermediate.dense.weight
- bert.encoder.layer.10.output.dense.weight
- bert.encoder.layer.11.intermediate.dense.weight
- bert.encoder.layer.11.output.dense.weight
data:
path: null
rephrase: true
zsre_nq: true
nq_path: ${hydra:runtime.cwd}/data/nq
wiki_webtext: true
n_edits: 1
eval:
verbose: true
log_interval: 100
final_eval: true
mend:
one_sided: false
n_hidden: 1
hidden_dim: null
init: id
norm: true
combine: true
x_only: false
delta_only: false
act: relu
rank: 1920
mlp_class: IDMLP
shared: true
task: fc
dataset: fever
train_set: fever/fever_train_1200 - spanish_1200.jsonl
val_set: fever/fever_dev_1200 - spanish_1200.jsonl
tests: false
[2023-03-25 14:47:21,305][__main__][INFO] - Project base directory: /home/anonymous-xme/mend/mend
[2023-03-25 14:47:21,344][models][INFO] - Loading model class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> with name bert-base-multilingual-uncased from cache dir /home/anonymous-xme/mend/mend/cache/
[2023-03-25 14:47:23,980][models][INFO] - Set 38 dropout modules to p=0.0
Data Size: 104422
Data Size: 10364
[2023-03-25 14:47:27,281][__main__][INFO] - Loading class MEND from module <module 'algs.mend' from '/home/anonymous-xme/mend/mend/algs/mend.py'>
[2023-03-25 14:47:27,282][algs.mend][INFO] - Hooked 6 modules
========== 768 3072
========== 3
[2023-03-25 14:47:27,283][algs.mend][INFO] - Building Gradient Transform with MLP class <class 'nn.IDMLP'>
[2023-03-25 14:47:27,283][nn][INFO] - Building IDMLP (id) [3840, 3840, 3840]
========== 3072 768
========== 3
[2023-03-25 14:47:27,384][algs.mend][INFO] - Building Gradient Transform with MLP class <class 'nn.IDMLP'>
[2023-03-25 14:47:27,384][nn][INFO] - Building IDMLP (id) [3840, 3840, 3840]
[2023-03-25 14:47:31,316][trainer][INFO] - Building optimizer <class 'torch.optim.adam.Adam'> with lr 1e-06
[2023-03-25 14:47:31,318][trainer][INFO] - Writing wandb run "fever - mend - bert-base-multilingual-uncased - 2023-03-25_14-47-21_2742039763" to /tmp/tmp98_od87f
[2023-03-25 14:47:34,427][trainer][INFO] - Step 0:
[2023-03-25 14:47:34,428][trainer][INFO] - loss/edit_train: 1.61134; loss/loc_train: 0.05951; edit/acc_train: 0.00000; edit/log_prob_train: -1.61134; edit/prob_train: 0.19962; acc/pre_train: 0.00000; acc/post_train: 1.00000; nll/pre_train: 0.73266; perplexity/pre_train: 2.08060; nll/post_train: 0.28587; perplexity/post_train: 1.33092; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.10415; loss/total_train: 0.22064; loss/total_edit_train: 0.22064; memory/alloc_max_train: 2791523840.00000; memory/res_max_train: 3131047936.00000
[2023-03-25 14:48:27,546][trainer][INFO] - Step 0:
[2023-03-25 14:48:27,547][trainer][INFO] - loss/edit_val: 1.49436; loss/loc_val: 0.06437; edit/acc_val: 0.00600; edit/log_prob_val: -1.49436; edit/prob_val: 0.23247; acc/pre_val: 0.47600; acc/post_val: 0.50200; nll/pre_val: 0.69428; perplexity/pre_val: 2.00227; nll/post_val: 0.85586; perplexity/post_val: 2.35339; n_tokens/pre_val: 4.00000; n_tokens/post_val: 4.00000; time/edit_val: 0.07004; loss/total_val: 0.21381; loss/total_edit_val: 0.21381; memory/alloc_max_val: 2936076791.80800; memory/res_max_val: 3463001931.77600; eval_time/elapsed: 53.09354; eval_time/average: 0.10619
[2023-03-25 14:48:27,551][trainer][INFO] - Saving model to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-47-21_2742039763/models/bert-base-multilingual-uncased.2023-03-25_14-47-21_2742039763
[2023-03-25 14:48:27,914][trainer][INFO] - Write complete.
[2023-03-25 14:48:41,139][trainer][INFO] - Step 100:
[2023-03-25 14:48:41,139][trainer][INFO] - loss/edit_train: 1.20901; loss/loc_train: 0.01794; edit/acc_train: 0.02000; edit/log_prob_train: -1.20901; edit/prob_train: 0.30815; acc/pre_train: 0.45000; acc/post_train: 0.51000; nll/pre_train: 0.70310; perplexity/pre_train: 2.02001; nll/post_train: 0.73340; perplexity/post_train: 2.08215; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07348; loss/total_train: 0.13884; loss/total_edit_train: 0.13884; memory/alloc_max_train: 3298729753.60000; memory/res_max_train: 3771308441.60000; grad_train: 175.34323; lr/lr0_train: 0.00009; lr/lr1_train: 0.00009; lr/lr2_train: 0.00009; lr/lr3_train: 0.00009; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 14:48:54,100][trainer][INFO] - Step 200:
[2023-03-25 14:48:54,100][trainer][INFO] - loss/edit_train: 1.04723; loss/loc_train: 0.00446; edit/acc_train: 0.03000; edit/log_prob_train: -1.04723; edit/prob_train: 0.35954; acc/pre_train: 0.47000; acc/post_train: 0.46000; nll/pre_train: 0.68809; perplexity/pre_train: 1.98992; nll/post_train: 0.73142; perplexity/post_train: 2.07803; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07272; loss/total_train: 0.10918; loss/total_edit_train: 0.10918; memory/alloc_max_train: 3340224921.60000; memory/res_max_train: 3804233728.00000; grad_train: 94.76478; lr/lr0_train: 0.00008; lr/lr1_train: 0.00009; lr/lr2_train: 0.00008; lr/lr3_train: 0.00008; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 14:49:07,937][trainer][INFO] - Step 300:
[2023-03-25 14:49:07,938][trainer][INFO] - loss/edit_train: 0.86042; loss/loc_train: 0.00315; edit/acc_train: 0.15000; edit/log_prob_train: -0.86042; edit/prob_train: 0.42942; acc/pre_train: 0.43000; acc/post_train: 0.27000; nll/pre_train: 0.70086; perplexity/pre_train: 2.01549; nll/post_train: 0.75736; perplexity/post_train: 2.13264; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07851; loss/total_train: 0.08919; loss/total_edit_train: 0.08919; memory/alloc_max_train: 3342214272.00000; memory/res_max_train: 3804233728.00000; grad_train: 62.21025; lr/lr0_train: 0.00008; lr/lr1_train: 0.00008; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 14:49:21,555][trainer][INFO] - Step 400:
[2023-03-25 14:49:21,556][trainer][INFO] - loss/edit_train: 0.72406; loss/loc_train: 0.00281; edit/acc_train: 0.40000; edit/log_prob_train: -0.72406; edit/prob_train: 0.48941; acc/pre_train: 0.35000; acc/post_train: 0.26000; nll/pre_train: 0.70402; perplexity/pre_train: 2.02186; nll/post_train: 0.74063; perplexity/post_train: 2.09726; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07719; loss/total_train: 0.07522; loss/total_edit_train: 0.07522; memory/alloc_max_train: 3346959969.28000; memory/res_max_train: 3807987630.08000; grad_train: 49.33402; lr/lr0_train: 0.00007; lr/lr1_train: 0.00008; lr/lr2_train: 0.00007; lr/lr3_train: 0.00007; lr/lr4_train: 0.00007; lr/lr5_train: 0.00007
[2023-03-25 14:49:34,840][trainer][INFO] - Step 500:
[2023-03-25 14:49:34,841][trainer][INFO] - loss/edit_train: 0.66922; loss/loc_train: 0.00350; edit/acc_train: 0.58000; edit/log_prob_train: -0.66922; edit/prob_train: 0.51677; acc/pre_train: 0.34000; acc/post_train: 0.49000; nll/pre_train: 0.70766; perplexity/pre_train: 2.02924; nll/post_train: 0.71190; perplexity/post_train: 2.03786; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07428; loss/total_train: 0.07043; loss/total_edit_train: 0.07043; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 35.64952; lr/lr0_train: 0.00007; lr/lr1_train: 0.00008; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:49:47,653][trainer][INFO] - Step 600:
[2023-03-25 14:49:47,653][trainer][INFO] - loss/edit_train: 0.57696; loss/loc_train: 0.00576; edit/acc_train: 0.86000; edit/log_prob_train: -0.57696; edit/prob_train: 0.56546; acc/pre_train: 0.44000; acc/post_train: 0.55000; nll/pre_train: 0.69399; perplexity/pre_train: 2.00168; nll/post_train: 0.70313; perplexity/post_train: 2.02006; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07193; loss/total_train: 0.06345; loss/total_edit_train: 0.06345; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 84.08903; lr/lr0_train: 0.00008; lr/lr1_train: 0.00008; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:50:01,514][trainer][INFO] - Step 700:
[2023-03-25 14:50:01,515][trainer][INFO] - loss/edit_train: 0.52118; loss/loc_train: 0.01125; edit/acc_train: 0.93000; edit/log_prob_train: -0.52118; edit/prob_train: 0.59937; acc/pre_train: 0.46000; acc/post_train: 0.50000; nll/pre_train: 0.69682; perplexity/pre_train: 2.00735; nll/post_train: 0.72867; perplexity/post_train: 2.07232; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07693; loss/total_train: 0.06337; loss/total_edit_train: 0.06337; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 87.08012; lr/lr0_train: 0.00008; lr/lr1_train: 0.00008; lr/lr2_train: 0.00005; lr/lr3_train: 0.00007; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:50:14,838][trainer][INFO] - Step 800:
[2023-03-25 14:50:14,838][trainer][INFO] - loss/edit_train: 0.50843; loss/loc_train: 0.00927; edit/acc_train: 0.94000; edit/log_prob_train: -0.50843; edit/prob_train: 0.60634; acc/pre_train: 0.39000; acc/post_train: 0.56000; nll/pre_train: 0.71805; perplexity/pre_train: 2.05044; nll/post_train: 0.69288; perplexity/post_train: 1.99947; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07415; loss/total_train: 0.06012; loss/total_edit_train: 0.06012; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 35.50285; lr/lr0_train: 0.00008; lr/lr1_train: 0.00008; lr/lr2_train: 0.00005; lr/lr3_train: 0.00007; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:50:28,127][trainer][INFO] - Step 900:
[2023-03-25 14:50:28,127][trainer][INFO] - loss/edit_train: 0.48813; loss/loc_train: 0.00971; edit/acc_train: 0.95000; edit/log_prob_train: -0.48813; edit/prob_train: 0.61935; acc/pre_train: 0.36000; acc/post_train: 0.45000; nll/pre_train: 0.72234; perplexity/pre_train: 2.05925; nll/post_train: 0.76846; perplexity/post_train: 2.15645; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07536; loss/total_train: 0.05853; loss/total_edit_train: 0.05853; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 118.73105; lr/lr0_train: 0.00008; lr/lr1_train: 0.00008; lr/lr2_train: 0.00005; lr/lr3_train: 0.00007; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:50:41,068][trainer][INFO] - Step 1000:
[2023-03-25 14:50:41,068][trainer][INFO] - loss/edit_train: 0.45082; loss/loc_train: 0.01088; edit/acc_train: 1.00000; edit/log_prob_train: -0.45082; edit/prob_train: 0.63853; acc/pre_train: 0.51000; acc/post_train: 0.55000; nll/pre_train: 0.68991; perplexity/pre_train: 1.99353; nll/post_train: 0.70957; perplexity/post_train: 2.03311; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07249; loss/total_train: 0.05596; loss/total_edit_train: 0.05596; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 49.53888; lr/lr0_train: 0.00009; lr/lr1_train: 0.00008; lr/lr2_train: 0.00005; lr/lr3_train: 0.00006; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:50:53,905][trainer][INFO] - Step 1100:
[2023-03-25 14:50:53,906][trainer][INFO] - loss/edit_train: 0.47359; loss/loc_train: 0.01014; edit/acc_train: 0.99000; edit/log_prob_train: -0.47359; edit/prob_train: 0.62420; acc/pre_train: 0.51000; acc/post_train: 0.44000; nll/pre_train: 0.69201; perplexity/pre_train: 1.99774; nll/post_train: 0.76217; perplexity/post_train: 2.14292; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07162; loss/total_train: 0.05750; loss/total_edit_train: 0.05750; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 32.29405; lr/lr0_train: 0.00009; lr/lr1_train: 0.00007; lr/lr2_train: 0.00005; lr/lr3_train: 0.00006; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:51:06,953][trainer][INFO] - Step 1200:
[2023-03-25 14:51:06,954][trainer][INFO] - loss/edit_train: 0.45544; loss/loc_train: 0.01021; edit/acc_train: 0.99000; edit/log_prob_train: -0.45544; edit/prob_train: 0.63543; acc/pre_train: 0.44000; acc/post_train: 0.51000; nll/pre_train: 0.68903; perplexity/pre_train: 1.99178; nll/post_train: 0.70807; perplexity/post_train: 2.03006; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07340; loss/total_train: 0.05576; loss/total_edit_train: 0.05576; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 57.15506; lr/lr0_train: 0.00009; lr/lr1_train: 0.00007; lr/lr2_train: 0.00005; lr/lr3_train: 0.00006; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:51:20,086][trainer][INFO] - Step 1300:
[2023-03-25 14:51:20,086][trainer][INFO] - loss/edit_train: 0.43264; loss/loc_train: 0.01106; edit/acc_train: 0.99000; edit/log_prob_train: -0.43264; edit/prob_train: 0.65205; acc/pre_train: 0.38000; acc/post_train: 0.52000; nll/pre_train: 0.70755; perplexity/pre_train: 2.02902; nll/post_train: 0.71747; perplexity/post_train: 2.04924; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07425; loss/total_train: 0.05433; loss/total_edit_train: 0.05433; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 51.91022; lr/lr0_train: 0.00009; lr/lr1_train: 0.00007; lr/lr2_train: 0.00005; lr/lr3_train: 0.00006; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:51:32,961][trainer][INFO] - Step 1400:
[2023-03-25 14:51:32,962][trainer][INFO] - loss/edit_train: 0.40359; loss/loc_train: 0.01104; edit/acc_train: 1.00000; edit/log_prob_train: -0.40359; edit/prob_train: 0.66906; acc/pre_train: 0.43000; acc/post_train: 0.53000; nll/pre_train: 0.70381; perplexity/pre_train: 2.02144; nll/post_train: 0.70839; perplexity/post_train: 2.03072; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07144; loss/total_train: 0.05140; loss/total_edit_train: 0.05140; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 40.42885; lr/lr0_train: 0.00009; lr/lr1_train: 0.00007; lr/lr2_train: 0.00005; lr/lr3_train: 0.00006; lr/lr4_train: 0.00006; lr/lr5_train: 0.00006
[2023-03-25 14:51:46,592][trainer][INFO] - Step 1500:
[2023-03-25 14:51:46,593][trainer][INFO] - loss/edit_train: 0.39512; loss/loc_train: 0.00991; edit/acc_train: 1.00000; edit/log_prob_train: -0.39512; edit/prob_train: 0.67577; acc/pre_train: 0.48000; acc/post_train: 0.54000; nll/pre_train: 0.69275; perplexity/pre_train: 1.99920; nll/post_train: 0.71222; perplexity/post_train: 2.03851; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07788; loss/total_train: 0.04943; loss/total_edit_train: 0.04943; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 55.53510; lr/lr0_train: 0.00009; lr/lr1_train: 0.00007; lr/lr2_train: 0.00005; lr/lr3_train: 0.00006; lr/lr4_train: 0.00007; lr/lr5_train: 0.00006
[2023-03-25 14:51:59,671][trainer][INFO] - Step 1600:
[2023-03-25 14:51:59,671][trainer][INFO] - loss/edit_train: 0.38569; loss/loc_train: 0.01066; edit/acc_train: 0.98000; edit/log_prob_train: -0.38569; edit/prob_train: 0.68441; acc/pre_train: 0.42000; acc/post_train: 0.53000; nll/pre_train: 0.71515; perplexity/pre_train: 2.04449; nll/post_train: 0.71959; perplexity/post_train: 2.05360; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07396; loss/total_train: 0.04923; loss/total_edit_train: 0.04923; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 88.27183; lr/lr0_train: 0.00009; lr/lr1_train: 0.00007; lr/lr2_train: 0.00005; lr/lr3_train: 0.00005; lr/lr4_train: 0.00008; lr/lr5_train: 0.00006
[2023-03-25 14:52:12,864][trainer][INFO] - Step 1700:
[2023-03-25 14:52:12,865][trainer][INFO] - loss/edit_train: 0.38597; loss/loc_train: 0.00928; edit/acc_train: 0.99000; edit/log_prob_train: -0.38597; edit/prob_train: 0.68317; acc/pre_train: 0.42000; acc/post_train: 0.51000; nll/pre_train: 0.70573; perplexity/pre_train: 2.02533; nll/post_train: 0.71671; perplexity/post_train: 2.04768; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07442; loss/total_train: 0.04787; loss/total_edit_train: 0.04787; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 89.58208; lr/lr0_train: 0.00009; lr/lr1_train: 0.00007; lr/lr2_train: 0.00005; lr/lr3_train: 0.00005; lr/lr4_train: 0.00008; lr/lr5_train: 0.00006
[2023-03-25 14:52:26,611][trainer][INFO] - Step 1800:
[2023-03-25 14:52:26,612][trainer][INFO] - loss/edit_train: 0.35583; loss/loc_train: 0.01125; edit/acc_train: 1.00000; edit/log_prob_train: -0.35583; edit/prob_train: 0.70373; acc/pre_train: 0.43000; acc/post_train: 0.48000; nll/pre_train: 0.69337; perplexity/pre_train: 2.00045; nll/post_train: 0.75227; perplexity/post_train: 2.12180; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07776; loss/total_train: 0.04684; loss/total_edit_train: 0.04684; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 69.49263; lr/lr0_train: 0.00009; lr/lr1_train: 0.00007; lr/lr2_train: 0.00005; lr/lr3_train: 0.00005; lr/lr4_train: 0.00009; lr/lr5_train: 0.00006
[2023-03-25 14:52:40,580][trainer][INFO] - Step 1900:
[2023-03-25 14:52:40,580][trainer][INFO] - loss/edit_train: 0.35589; loss/loc_train: 0.01076; edit/acc_train: 1.00000; edit/log_prob_train: -0.35589; edit/prob_train: 0.70387; acc/pre_train: 0.46000; acc/post_train: 0.62000; nll/pre_train: 0.69180; perplexity/pre_train: 1.99731; nll/post_train: 0.64851; perplexity/post_train: 1.91269; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.07777; loss/total_train: 0.04635; loss/total_edit_train: 0.04635; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 90.16727; lr/lr0_train: 0.00009; lr/lr1_train: 0.00007; lr/lr2_train: 0.00005; lr/lr3_train: 0.00005; lr/lr4_train: 0.00010; lr/lr5_train: 0.00006
[2023-03-25 14:52:56,260][trainer][INFO] - Step 2000:
[2023-03-25 14:52:56,261][trainer][INFO] - loss/edit_train: 0.33711; loss/loc_train: 0.01194; edit/acc_train: 1.00000; edit/log_prob_train: -0.33711; edit/prob_train: 0.71761; acc/pre_train: 0.39000; acc/post_train: 0.48000; nll/pre_train: 0.69350; perplexity/pre_train: 2.00070; nll/post_train: 0.75417; perplexity/post_train: 2.12584; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08749; loss/total_train: 0.04565; loss/total_edit_train: 0.04565; memory/alloc_max_train: 3356403712.00000; memory/res_max_train: 3818913792.00000; grad_train: 131.00184; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00005; lr/lr3_train: 0.00005; lr/lr4_train: 0.00011; lr/lr5_train: 0.00007
[2023-03-25 14:53:11,391][trainer][INFO] - Step 2100:
[2023-03-25 14:53:11,392][trainer][INFO] - loss/edit_train: 0.37307; loss/loc_train: 0.00778; edit/acc_train: 0.99000; edit/log_prob_train: -0.37307; edit/prob_train: 0.69314; acc/pre_train: 0.47000; acc/post_train: 0.58000; nll/pre_train: 0.67966; perplexity/pre_train: 1.97321; nll/post_train: 0.66909; perplexity/post_train: 1.95246; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08416; loss/total_train: 0.04508; loss/total_edit_train: 0.04508; memory/alloc_max_train: 3356404316.16000; memory/res_max_train: 3818913792.00000; grad_train: 128.91123; lr/lr0_train: 0.00008; lr/lr1_train: 0.00006; lr/lr2_train: 0.00005; lr/lr3_train: 0.00005; lr/lr4_train: 0.00012; lr/lr5_train: 0.00007
[2023-03-25 14:53:26,234][trainer][INFO] - Step 2200:
[2023-03-25 14:53:26,235][trainer][INFO] - loss/edit_train: 0.28698; loss/loc_train: 0.01281; edit/acc_train: 1.00000; edit/log_prob_train: -0.28698; edit/prob_train: 0.75270; acc/pre_train: 0.50000; acc/post_train: 0.56000; nll/pre_train: 0.68933; perplexity/pre_train: 1.99237; nll/post_train: 0.72531; perplexity/post_train: 2.06537; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08236; loss/total_train: 0.04151; loss/total_edit_train: 0.04151; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 74.22501; lr/lr0_train: 0.00008; lr/lr1_train: 0.00006; lr/lr2_train: 0.00005; lr/lr3_train: 0.00004; lr/lr4_train: 0.00012; lr/lr5_train: 0.00007
[2023-03-25 14:53:42,137][trainer][INFO] - Step 2300:
[2023-03-25 14:53:42,137][trainer][INFO] - loss/edit_train: 0.30600; loss/loc_train: 0.00931; edit/acc_train: 1.00000; edit/log_prob_train: -0.30600; edit/prob_train: 0.73958; acc/pre_train: 0.56000; acc/post_train: 0.57000; nll/pre_train: 0.68177; perplexity/pre_train: 1.97738; nll/post_train: 0.69976; perplexity/post_train: 2.01327; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08859; loss/total_train: 0.03991; loss/total_edit_train: 0.03991; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 82.26633; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00006; lr/lr3_train: 0.00004; lr/lr4_train: 0.00013; lr/lr5_train: 0.00008
[2023-03-25 14:53:58,074][trainer][INFO] - Step 2400:
[2023-03-25 14:53:58,075][trainer][INFO] - loss/edit_train: 0.26780; loss/loc_train: 0.01065; edit/acc_train: 0.99000; edit/log_prob_train: -0.26780; edit/prob_train: 0.76950; acc/pre_train: 0.45000; acc/post_train: 0.49000; nll/pre_train: 0.70394; perplexity/pre_train: 2.02169; nll/post_train: 0.75020; perplexity/post_train: 2.11743; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08686; loss/total_train: 0.03743; loss/total_edit_train: 0.03743; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 122.07277; lr/lr0_train: 0.00008; lr/lr1_train: 0.00007; lr/lr2_train: 0.00006; lr/lr3_train: 0.00003; lr/lr4_train: 0.00014; lr/lr5_train: 0.00008
[2023-03-25 14:54:13,787][trainer][INFO] - Step 2500:
[2023-03-25 14:54:13,788][trainer][INFO] - loss/edit_train: 0.29172; loss/loc_train: 0.01023; edit/acc_train: 0.99000; edit/log_prob_train: -0.29172; edit/prob_train: 0.75182; acc/pre_train: 0.50000; acc/post_train: 0.49000; nll/pre_train: 0.69956; perplexity/pre_train: 2.01286; nll/post_train: 0.74345; perplexity/post_train: 2.10318; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08768; loss/total_train: 0.03940; loss/total_edit_train: 0.03940; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 110.95207; lr/lr0_train: 0.00008; lr/lr1_train: 0.00007; lr/lr2_train: 0.00006; lr/lr3_train: 0.00003; lr/lr4_train: 0.00014; lr/lr5_train: 0.00008
[2023-03-25 14:54:29,660][trainer][INFO] - Step 2600:
[2023-03-25 14:54:29,660][trainer][INFO] - loss/edit_train: 0.26395; loss/loc_train: 0.01314; edit/acc_train: 1.00000; edit/log_prob_train: -0.26395; edit/prob_train: 0.77167; acc/pre_train: 0.37000; acc/post_train: 0.50000; nll/pre_train: 0.70167; perplexity/pre_train: 2.01712; nll/post_train: 0.76337; perplexity/post_train: 2.14550; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08608; loss/total_train: 0.03953; loss/total_edit_train: 0.03953; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 167.71666; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00006; lr/lr3_train: 0.00003; lr/lr4_train: 0.00014; lr/lr5_train: 0.00008
[2023-03-25 14:54:45,928][trainer][INFO] - Step 2700:
[2023-03-25 14:54:45,929][trainer][INFO] - loss/edit_train: 0.29041; loss/loc_train: 0.00747; edit/acc_train: 0.98000; edit/log_prob_train: -0.29041; edit/prob_train: 0.75399; acc/pre_train: 0.40000; acc/post_train: 0.46000; nll/pre_train: 0.71045; perplexity/pre_train: 2.03491; nll/post_train: 0.73104; perplexity/post_train: 2.07725; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08762; loss/total_train: 0.03651; loss/total_edit_train: 0.03651; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 94.60183; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00006; lr/lr3_train: 0.00002; lr/lr4_train: 0.00015; lr/lr5_train: 0.00009
[2023-03-25 14:55:02,006][trainer][INFO] - Step 2800:
[2023-03-25 14:55:02,006][trainer][INFO] - loss/edit_train: 0.24992; loss/loc_train: 0.00880; edit/acc_train: 1.00000; edit/log_prob_train: -0.24992; edit/prob_train: 0.78348; acc/pre_train: 0.43000; acc/post_train: 0.54000; nll/pre_train: 0.69234; perplexity/pre_train: 1.99838; nll/post_train: 0.70490; perplexity/post_train: 2.02364; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08685; loss/total_train: 0.03380; loss/total_edit_train: 0.03380; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 123.72794; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00007; lr/lr3_train: 0.00002; lr/lr4_train: 0.00015; lr/lr5_train: 0.00009
[2023-03-25 14:55:17,776][trainer][INFO] - Step 2900:
[2023-03-25 14:55:17,776][trainer][INFO] - loss/edit_train: 0.24655; loss/loc_train: 0.00828; edit/acc_train: 0.99000; edit/log_prob_train: -0.24655; edit/prob_train: 0.78640; acc/pre_train: 0.53000; acc/post_train: 0.47000; nll/pre_train: 0.68622; perplexity/pre_train: 1.98619; nll/post_train: 0.76186; perplexity/post_train: 2.14225; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09016; loss/total_train: 0.03293; loss/total_edit_train: 0.03293; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 131.12010; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00007; lr/lr3_train: 0.00002; lr/lr4_train: 0.00016; lr/lr5_train: 0.00009
[2023-03-25 14:55:33,602][trainer][INFO] - Step 3000:
[2023-03-25 14:55:33,602][trainer][INFO] - loss/edit_train: 0.24402; loss/loc_train: 0.00627; edit/acc_train: 1.00000; edit/log_prob_train: -0.24402; edit/prob_train: 0.78781; acc/pre_train: 0.49000; acc/post_train: 0.60000; nll/pre_train: 0.68138; perplexity/pre_train: 1.97660; nll/post_train: 0.68055; perplexity/post_train: 1.97496; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08783; loss/total_train: 0.03067; loss/total_edit_train: 0.03067; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 141.68592; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00007; lr/lr3_train: 0.00002; lr/lr4_train: 0.00016; lr/lr5_train: 0.00010
[2023-03-25 14:55:49,603][trainer][INFO] - Step 3100:
[2023-03-25 14:55:49,603][trainer][INFO] - loss/edit_train: 0.24735; loss/loc_train: 0.00903; edit/acc_train: 0.99000; edit/log_prob_train: -0.24735; edit/prob_train: 0.78744; acc/pre_train: 0.54000; acc/post_train: 0.50000; nll/pre_train: 0.68121; perplexity/pre_train: 1.97626; nll/post_train: 0.71454; perplexity/post_train: 2.04325; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08693; loss/total_train: 0.03376; loss/total_edit_train: 0.03376; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 94.21591; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00007; lr/lr3_train: 0.00002; lr/lr4_train: 0.00016; lr/lr5_train: 0.00010
[2023-03-25 14:56:05,525][trainer][INFO] - Step 3200:
[2023-03-25 14:56:05,526][trainer][INFO] - loss/edit_train: 0.25077; loss/loc_train: 0.00916; edit/acc_train: 0.98000; edit/log_prob_train: -0.25077; edit/prob_train: 0.78560; acc/pre_train: 0.46000; acc/post_train: 0.62000; nll/pre_train: 0.69298; perplexity/pre_train: 1.99967; nll/post_train: 0.70903; perplexity/post_train: 2.03202; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08809; loss/total_train: 0.03423; loss/total_edit_train: 0.03423; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 144.98898; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00007; lr/lr3_train: 0.00002; lr/lr4_train: 0.00017; lr/lr5_train: 0.00010
[2023-03-25 14:56:21,149][trainer][INFO] - Step 3300:
[2023-03-25 14:56:21,149][trainer][INFO] - loss/edit_train: 0.21434; loss/loc_train: 0.00940; edit/acc_train: 1.00000; edit/log_prob_train: -0.21434; edit/prob_train: 0.81185; acc/pre_train: 0.38000; acc/post_train: 0.52000; nll/pre_train: 0.71170; perplexity/pre_train: 2.03744; nll/post_train: 0.70344; perplexity/post_train: 2.02069; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08593; loss/total_train: 0.03083; loss/total_edit_train: 0.03083; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 136.83434; lr/lr0_train: 0.00009; lr/lr1_train: 0.00006; lr/lr2_train: 0.00008; lr/lr3_train: 0.00002; lr/lr4_train: 0.00017; lr/lr5_train: 0.00011
[2023-03-25 14:56:36,773][trainer][INFO] - Step 3400:
[2023-03-25 14:56:36,774][trainer][INFO] - loss/edit_train: 0.25911; loss/loc_train: 0.00581; edit/acc_train: 0.98000; edit/log_prob_train: -0.25911; edit/prob_train: 0.78072; acc/pre_train: 0.37000; acc/post_train: 0.52000; nll/pre_train: 0.69920; perplexity/pre_train: 2.01215; nll/post_train: 0.69877; perplexity/post_train: 2.01127; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08748; loss/total_train: 0.03172; loss/total_edit_train: 0.03172; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 122.28697; lr/lr0_train: 0.00010; lr/lr1_train: 0.00006; lr/lr2_train: 0.00008; lr/lr3_train: 0.00002; lr/lr4_train: 0.00017; lr/lr5_train: 0.00011
[2023-03-25 14:56:52,462][trainer][INFO] - Step 3500:
[2023-03-25 14:56:52,462][trainer][INFO] - loss/edit_train: 0.20304; loss/loc_train: 0.01099; edit/acc_train: 0.99000; edit/log_prob_train: -0.20304; edit/prob_train: 0.82359; acc/pre_train: 0.47000; acc/post_train: 0.47000; nll/pre_train: 0.70698; perplexity/pre_train: 2.02786; nll/post_train: 0.73225; perplexity/post_train: 2.07976; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08743; loss/total_train: 0.03129; loss/total_edit_train: 0.03129; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 140.18231; lr/lr0_train: 0.00010; lr/lr1_train: 0.00006; lr/lr2_train: 0.00008; lr/lr3_train: 0.00002; lr/lr4_train: 0.00018; lr/lr5_train: 0.00012
[2023-03-25 14:57:08,365][trainer][INFO] - Step 3600:
[2023-03-25 14:57:08,365][trainer][INFO] - loss/edit_train: 0.18194; loss/loc_train: 0.01036; edit/acc_train: 1.00000; edit/log_prob_train: -0.18194; edit/prob_train: 0.83785; acc/pre_train: 0.34000; acc/post_train: 0.52000; nll/pre_train: 0.72107; perplexity/pre_train: 2.05664; nll/post_train: 0.75376; perplexity/post_train: 2.12497; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08517; loss/total_train: 0.02855; loss/total_edit_train: 0.02855; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 153.42613; lr/lr0_train: 0.00010; lr/lr1_train: 0.00006; lr/lr2_train: 0.00008; lr/lr3_train: 0.00002; lr/lr4_train: 0.00018; lr/lr5_train: 0.00012
[2023-03-25 14:57:24,351][trainer][INFO] - Step 3700:
[2023-03-25 14:57:24,351][trainer][INFO] - loss/edit_train: 0.18779; loss/loc_train: 0.00829; edit/acc_train: 1.00000; edit/log_prob_train: -0.18779; edit/prob_train: 0.83472; acc/pre_train: 0.41000; acc/post_train: 0.54000; nll/pre_train: 0.70340; perplexity/pre_train: 2.02060; nll/post_train: 0.74074; perplexity/post_train: 2.09750; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08627; loss/total_train: 0.02707; loss/total_edit_train: 0.02707; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 151.16315; lr/lr0_train: 0.00010; lr/lr1_train: 0.00006; lr/lr2_train: 0.00008; lr/lr3_train: 0.00002; lr/lr4_train: 0.00018; lr/lr5_train: 0.00012
[2023-03-25 14:57:40,555][trainer][INFO] - Step 3800:
[2023-03-25 14:57:40,556][trainer][INFO] - loss/edit_train: 0.18459; loss/loc_train: 0.00572; edit/acc_train: 1.00000; edit/log_prob_train: -0.18459; edit/prob_train: 0.83595; acc/pre_train: 0.41000; acc/post_train: 0.50000; nll/pre_train: 0.70223; perplexity/pre_train: 2.01825; nll/post_train: 0.73616; perplexity/post_train: 2.08790; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09075; loss/total_train: 0.02418; loss/total_edit_train: 0.02418; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 113.32171; lr/lr0_train: 0.00010; lr/lr1_train: 0.00006; lr/lr2_train: 0.00009; lr/lr3_train: 0.00002; lr/lr4_train: 0.00018; lr/lr5_train: 0.00013
[2023-03-25 14:57:56,783][trainer][INFO] - Step 3900:
[2023-03-25 14:57:56,784][trainer][INFO] - loss/edit_train: 0.14410; loss/loc_train: 0.00756; edit/acc_train: 1.00000; edit/log_prob_train: -0.14410; edit/prob_train: 0.87013; acc/pre_train: 0.38000; acc/post_train: 0.61000; nll/pre_train: 0.70532; perplexity/pre_train: 2.02449; nll/post_train: 0.66899; perplexity/post_train: 1.95227; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08848; loss/total_train: 0.02197; loss/total_edit_train: 0.02197; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 103.32264; lr/lr0_train: 0.00010; lr/lr1_train: 0.00006; lr/lr2_train: 0.00009; lr/lr3_train: 0.00002; lr/lr4_train: 0.00019; lr/lr5_train: 0.00014
[2023-03-25 14:58:12,520][trainer][INFO] - Step 4000:
[2023-03-25 14:58:12,520][trainer][INFO] - loss/edit_train: 0.14727; loss/loc_train: 0.01463; edit/acc_train: 0.99000; edit/log_prob_train: -0.14727; edit/prob_train: 0.86874; acc/pre_train: 0.47000; acc/post_train: 0.50000; nll/pre_train: 0.71176; perplexity/pre_train: 2.03757; nll/post_train: 0.71158; perplexity/post_train: 2.03722; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08537; loss/total_train: 0.02936; loss/total_edit_train: 0.02936; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 187.67708; lr/lr0_train: 0.00010; lr/lr1_train: 0.00007; lr/lr2_train: 0.00009; lr/lr3_train: 0.00002; lr/lr4_train: 0.00019; lr/lr5_train: 0.00014
[2023-03-25 14:58:28,266][trainer][INFO] - Step 4100:
[2023-03-25 14:58:28,267][trainer][INFO] - loss/edit_train: 0.15949; loss/loc_train: 0.00587; edit/acc_train: 1.00000; edit/log_prob_train: -0.15949; edit/prob_train: 0.85954; acc/pre_train: 0.34000; acc/post_train: 0.53000; nll/pre_train: 0.71868; perplexity/pre_train: 2.05173; nll/post_train: 0.70601; perplexity/post_train: 2.02590; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08868; loss/total_train: 0.02182; loss/total_edit_train: 0.02182; memory/alloc_max_train: 3356410368.00000; memory/res_max_train: 3818913792.00000; grad_train: 202.42547; lr/lr0_train: 0.00010; lr/lr1_train: 0.00007; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00019; lr/lr5_train: 0.00014
[2023-03-25 14:58:44,293][trainer][INFO] - Step 4200:
[2023-03-25 14:58:44,293][trainer][INFO] - loss/edit_train: 0.13622; loss/loc_train: 0.00845; edit/acc_train: 1.00000; edit/log_prob_train: -0.13622; edit/prob_train: 0.87668; acc/pre_train: 0.41000; acc/post_train: 0.54000; nll/pre_train: 0.70554; perplexity/pre_train: 2.02494; nll/post_train: 0.72634; perplexity/post_train: 2.06750; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08742; loss/total_train: 0.02207; loss/total_edit_train: 0.02207; memory/alloc_max_train: 3356410393.60000; memory/res_max_train: 3818913792.00000; grad_train: 128.12811; lr/lr0_train: 0.00011; lr/lr1_train: 0.00006; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00019; lr/lr5_train: 0.00014
[2023-03-25 14:59:00,323][trainer][INFO] - Step 4300:
[2023-03-25 14:59:00,323][trainer][INFO] - loss/edit_train: 0.13273; loss/loc_train: 0.00925; edit/acc_train: 1.00000; edit/log_prob_train: -0.13273; edit/prob_train: 0.87996; acc/pre_train: 0.40000; acc/post_train: 0.46000; nll/pre_train: 0.69674; perplexity/pre_train: 2.00719; nll/post_train: 0.71068; perplexity/post_train: 2.03537; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08647; loss/total_train: 0.02253; loss/total_edit_train: 0.02253; memory/alloc_max_train: 3356412928.00000; memory/res_max_train: 3818913792.00000; grad_train: 146.90480; lr/lr0_train: 0.00011; lr/lr1_train: 0.00006; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00019; lr/lr5_train: 0.00015
[2023-03-25 14:59:16,696][trainer][INFO] - Step 4400:
[2023-03-25 14:59:16,697][trainer][INFO] - loss/edit_train: 0.15887; loss/loc_train: 0.00517; edit/acc_train: 0.99000; edit/log_prob_train: -0.15887; edit/prob_train: 0.86081; acc/pre_train: 0.46000; acc/post_train: 0.51000; nll/pre_train: 0.69127; perplexity/pre_train: 1.99625; nll/post_train: 0.69800; perplexity/post_train: 2.00972; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09201; loss/total_train: 0.02105; loss/total_edit_train: 0.02105; memory/alloc_max_train: 3356412928.00000; memory/res_max_train: 3818913792.00000; grad_train: 155.12519; lr/lr0_train: 0.00011; lr/lr1_train: 0.00007; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00019; lr/lr5_train: 0.00015
[2023-03-25 14:59:32,394][trainer][INFO] - Step 4500:
[2023-03-25 14:59:32,394][trainer][INFO] - loss/edit_train: 0.12945; loss/loc_train: 0.01152; edit/acc_train: 1.00000; edit/log_prob_train: -0.12945; edit/prob_train: 0.88508; acc/pre_train: 0.48000; acc/post_train: 0.59000; nll/pre_train: 0.67755; perplexity/pre_train: 1.96905; nll/post_train: 0.69732; perplexity/post_train: 2.00837; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09127; loss/total_train: 0.02447; loss/total_edit_train: 0.02447; memory/alloc_max_train: 3356412928.00000; memory/res_max_train: 3818913792.00000; grad_train: 196.74111; lr/lr0_train: 0.00011; lr/lr1_train: 0.00007; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00019; lr/lr5_train: 0.00015
[2023-03-25 14:59:48,346][trainer][INFO] - Step 4600:
[2023-03-25 14:59:48,346][trainer][INFO] - loss/edit_train: 0.12023; loss/loc_train: 0.00516; edit/acc_train: 1.00000; edit/log_prob_train: -0.12023; edit/prob_train: 0.89097; acc/pre_train: 0.42000; acc/post_train: 0.43000; nll/pre_train: 0.71096; perplexity/pre_train: 2.03595; nll/post_train: 0.74233; perplexity/post_train: 2.10082; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08688; loss/total_train: 0.01718; loss/total_edit_train: 0.01718; memory/alloc_max_train: 3356412928.00000; memory/res_max_train: 3818913792.00000; grad_train: 137.06706; lr/lr0_train: 0.00011; lr/lr1_train: 0.00007; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00019; lr/lr5_train: 0.00015
[2023-03-25 15:00:04,887][trainer][INFO] - Step 4700:
[2023-03-25 15:00:04,888][trainer][INFO] - loss/edit_train: 0.14297; loss/loc_train: 0.01024; edit/acc_train: 1.00000; edit/log_prob_train: -0.14297; edit/prob_train: 0.87421; acc/pre_train: 0.40000; acc/post_train: 0.50000; nll/pre_train: 0.70360; perplexity/pre_train: 2.02101; nll/post_train: 0.74849; perplexity/post_train: 2.11380; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09052; loss/total_train: 0.02454; loss/total_edit_train: 0.02454; memory/alloc_max_train: 3356412928.00000; memory/res_max_train: 3818913792.00000; grad_train: 161.11827; lr/lr0_train: 0.00012; lr/lr1_train: 0.00007; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00019; lr/lr5_train: 0.00015
[2023-03-25 15:00:20,849][trainer][INFO] - Step 4800:
[2023-03-25 15:00:20,849][trainer][INFO] - loss/edit_train: 0.12380; loss/loc_train: 0.00483; edit/acc_train: 0.99000; edit/log_prob_train: -0.12380; edit/prob_train: 0.89003; acc/pre_train: 0.40000; acc/post_train: 0.55000; nll/pre_train: 0.71012; perplexity/pre_train: 2.03423; nll/post_train: 0.69937; perplexity/post_train: 2.01248; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09190; loss/total_train: 0.01721; loss/total_edit_train: 0.01721; memory/alloc_max_train: 3356412928.00000; memory/res_max_train: 3818913792.00000; grad_train: 150.04494; lr/lr0_train: 0.00012; lr/lr1_train: 0.00007; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00019; lr/lr5_train: 0.00015
[2023-03-25 15:00:37,078][trainer][INFO] - Step 4900:
[2023-03-25 15:00:37,078][trainer][INFO] - loss/edit_train: 0.10970; loss/loc_train: 0.00770; edit/acc_train: 0.99000; edit/log_prob_train: -0.10970; edit/prob_train: 0.90070; acc/pre_train: 0.38000; acc/post_train: 0.57000; nll/pre_train: 0.70180; perplexity/pre_train: 2.01737; nll/post_train: 0.71818; perplexity/post_train: 2.05070; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08757; loss/total_train: 0.01867; loss/total_edit_train: 0.01867; memory/alloc_max_train: 3356412928.00000; memory/res_max_train: 3818913792.00000; grad_train: 139.14756; lr/lr0_train: 0.00012; lr/lr1_train: 0.00007; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:00:53,304][trainer][INFO] - Step 5000:
[2023-03-25 15:00:53,304][trainer][INFO] - loss/edit_train: 0.11627; loss/loc_train: 0.00907; edit/acc_train: 1.00000; edit/log_prob_train: -0.11627; edit/prob_train: 0.89577; acc/pre_train: 0.42000; acc/post_train: 0.45000; nll/pre_train: 0.69762; perplexity/pre_train: 2.00897; nll/post_train: 0.70034; perplexity/post_train: 2.01444; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08803; loss/total_train: 0.02070; loss/total_edit_train: 0.02070; memory/alloc_max_train: 3356412928.00000; memory/res_max_train: 3818913792.00000; grad_train: 267.18247; lr/lr0_train: 0.00012; lr/lr1_train: 0.00008; lr/lr2_train: 0.00009; lr/lr3_train: 0.00001; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:01:52,579][trainer][INFO] - Step 5000:
[2023-03-25 15:01:52,580][trainer][INFO] - loss/edit_val: 0.12405; loss/loc_val: 0.01134; edit/acc_val: 0.99800; edit/log_prob_val: -0.12405; edit/prob_val: 0.88997; acc/pre_val: 0.47600; acc/post_val: 0.49950; nll/pre_val: 0.69428; perplexity/pre_val: 2.00227; nll/post_val: 0.72796; perplexity/post_val: 2.07085; n_tokens/pre_val: 4.00000; n_tokens/post_val: 4.00000; time/edit_val: 0.08001; loss/total_val: 0.02375; loss/total_edit_val: 0.02375; memory/alloc_max_val: 3408051706.88000; memory/res_max_val: 3818913792.00000; eval_time/elapsed: 59.24975; eval_time/average: 0.11850
[2023-03-25 15:01:52,583][trainer][INFO] - Saving model to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-47-21_2742039763/models/bert-base-multilingual-uncased.2023-03-25_14-47-21_2742039763
[2023-03-25 15:01:52,583][trainer][INFO] - Moving old archive to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-47-21_2742039763/models/bert-base-multilingual-uncased.2023-03-25_14-47-21_2742039763.bk
[2023-03-25 15:01:53,628][trainer][INFO] - Write complete.
[2023-03-25 15:02:09,908][trainer][INFO] - Step 5100:
[2023-03-25 15:02:09,909][trainer][INFO] - loss/edit_train: 0.14868; loss/loc_train: 0.00843; edit/acc_train: 0.99000; edit/log_prob_train: -0.14868; edit/prob_train: 0.87031; acc/pre_train: 0.40000; acc/post_train: 0.54000; nll/pre_train: 0.70783; perplexity/pre_train: 2.02959; nll/post_train: 0.65550; perplexity/post_train: 1.92611; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08780; loss/total_train: 0.02330; loss/total_edit_train: 0.02330; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 159.86229; lr/lr0_train: 0.00012; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00001; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:02:26,364][trainer][INFO] - Step 5200:
[2023-03-25 15:02:26,365][trainer][INFO] - loss/edit_train: 0.14729; loss/loc_train: 0.00794; edit/acc_train: 1.00000; edit/log_prob_train: -0.14729; edit/prob_train: 0.87144; acc/pre_train: 0.42000; acc/post_train: 0.42000; nll/pre_train: 0.70494; perplexity/pre_train: 2.02372; nll/post_train: 0.77210; perplexity/post_train: 2.16432; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08860; loss/total_train: 0.02267; loss/total_edit_train: 0.02267; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 104.23131; lr/lr0_train: 0.00013; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00001; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:02:42,871][trainer][INFO] - Step 5300:
[2023-03-25 15:02:42,871][trainer][INFO] - loss/edit_train: 0.11067; loss/loc_train: 0.00667; edit/acc_train: 0.99000; edit/log_prob_train: -0.11067; edit/prob_train: 0.90193; acc/pre_train: 0.48000; acc/post_train: 0.53000; nll/pre_train: 0.69764; perplexity/pre_train: 2.00901; nll/post_train: 0.73792; perplexity/post_train: 2.09158; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09173; loss/total_train: 0.01773; loss/total_edit_train: 0.01773; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 65.83508; lr/lr0_train: 0.00013; lr/lr1_train: 0.00007; lr/lr2_train: 0.00010; lr/lr3_train: 0.00001; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:02:58,386][trainer][INFO] - Step 5400:
[2023-03-25 15:02:58,386][trainer][INFO] - loss/edit_train: 0.13576; loss/loc_train: 0.00834; edit/acc_train: 1.00000; edit/log_prob_train: -0.13576; edit/prob_train: 0.88169; acc/pre_train: 0.44000; acc/post_train: 0.49000; nll/pre_train: 0.69983; perplexity/pre_train: 2.01341; nll/post_train: 0.72740; perplexity/post_train: 2.06970; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08472; loss/total_train: 0.02191; loss/total_edit_train: 0.02191; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 115.87355; lr/lr0_train: 0.00013; lr/lr1_train: 0.00007; lr/lr2_train: 0.00010; lr/lr3_train: 0.00001; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:03:14,821][trainer][INFO] - Step 5500:
[2023-03-25 15:03:14,822][trainer][INFO] - loss/edit_train: 0.11128; loss/loc_train: 0.01409; edit/acc_train: 0.99000; edit/log_prob_train: -0.11128; edit/prob_train: 0.90353; acc/pre_train: 0.34000; acc/post_train: 0.52000; nll/pre_train: 0.71484; perplexity/pre_train: 2.04385; nll/post_train: 0.75546; perplexity/post_train: 2.12859; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08940; loss/total_train: 0.02522; loss/total_edit_train: 0.02522; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 233.64366; lr/lr0_train: 0.00013; lr/lr1_train: 0.00007; lr/lr2_train: 0.00010; lr/lr3_train: 0.00001; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:03:31,336][trainer][INFO] - Step 5600:
[2023-03-25 15:03:31,337][trainer][INFO] - loss/edit_train: 0.13730; loss/loc_train: 0.00289; edit/acc_train: 1.00000; edit/log_prob_train: -0.13730; edit/prob_train: 0.87893; acc/pre_train: 0.46000; acc/post_train: 0.51000; nll/pre_train: 0.69530; perplexity/pre_train: 2.00430; nll/post_train: 0.68415; perplexity/post_train: 1.98209; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09157; loss/total_train: 0.01662; loss/total_edit_train: 0.01662; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 112.90364; lr/lr0_train: 0.00013; lr/lr1_train: 0.00007; lr/lr2_train: 0.00010; lr/lr3_train: 0.00001; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:03:47,061][trainer][INFO] - Step 5700:
[2023-03-25 15:03:47,061][trainer][INFO] - loss/edit_train: 0.09865; loss/loc_train: 0.00776; edit/acc_train: 1.00000; edit/log_prob_train: -0.09865; edit/prob_train: 0.91048; acc/pre_train: 0.43000; acc/post_train: 0.45000; nll/pre_train: 0.70748; perplexity/pre_train: 2.02887; nll/post_train: 0.74811; perplexity/post_train: 2.11301; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08585; loss/total_train: 0.01763; loss/total_edit_train: 0.01763; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 146.86242; lr/lr0_train: 0.00013; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00001; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:04:02,629][trainer][INFO] - Step 5800:
[2023-03-25 15:04:02,629][trainer][INFO] - loss/edit_train: 0.11345; loss/loc_train: 0.00378; edit/acc_train: 1.00000; edit/log_prob_train: -0.11345; edit/prob_train: 0.89932; acc/pre_train: 0.44000; acc/post_train: 0.53000; nll/pre_train: 0.70285; perplexity/pre_train: 2.01950; nll/post_train: 0.72407; perplexity/post_train: 2.06281; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08549; loss/total_train: 0.01512; loss/total_edit_train: 0.01512; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 175.18300; lr/lr0_train: 0.00013; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00000; lr/lr4_train: 0.00020; lr/lr5_train: 0.00016
[2023-03-25 15:04:18,624][trainer][INFO] - Step 5900:
[2023-03-25 15:04:18,625][trainer][INFO] - loss/edit_train: 0.13336; loss/loc_train: 0.00799; edit/acc_train: 0.96000; edit/log_prob_train: -0.13336; edit/prob_train: 0.88884; acc/pre_train: 0.38000; acc/post_train: 0.58000; nll/pre_train: 0.69581; perplexity/pre_train: 2.00532; nll/post_train: 0.70511; perplexity/post_train: 2.02408; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09085; loss/total_train: 0.02133; loss/total_edit_train: 0.02133; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 136.55680; lr/lr0_train: 0.00013; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00000; lr/lr4_train: 0.00021; lr/lr5_train: 0.00016
[2023-03-25 15:04:34,926][trainer][INFO] - Step 6000:
[2023-03-25 15:04:34,926][trainer][INFO] - loss/edit_train: 0.13339; loss/loc_train: 0.00242; edit/acc_train: 0.99000; edit/log_prob_train: -0.13339; edit/prob_train: 0.88369; acc/pre_train: 0.39000; acc/post_train: 0.45000; nll/pre_train: 0.70080; perplexity/pre_train: 2.01537; nll/post_train: 0.72026; perplexity/post_train: 2.05497; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09165; loss/total_train: 0.01576; loss/total_edit_train: 0.01576; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 98.53197; lr/lr0_train: 0.00013; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00000; lr/lr4_train: 0.00021; lr/lr5_train: 0.00016
[2023-03-25 15:04:50,804][trainer][INFO] - Step 6100:
[2023-03-25 15:04:50,805][trainer][INFO] - loss/edit_train: 0.08802; loss/loc_train: 0.00554; edit/acc_train: 1.00000; edit/log_prob_train: -0.08802; edit/prob_train: 0.92012; acc/pre_train: 0.44000; acc/post_train: 0.58000; nll/pre_train: 0.69792; perplexity/pre_train: 2.00957; nll/post_train: 0.70761; perplexity/post_train: 2.02914; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08530; loss/total_train: 0.01434; loss/total_edit_train: 0.01434; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 86.00109; lr/lr0_train: 0.00014; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00000; lr/lr4_train: 0.00021; lr/lr5_train: 0.00017
[2023-03-25 15:05:06,491][trainer][INFO] - Step 6200:
[2023-03-25 15:05:06,492][trainer][INFO] - loss/edit_train: 0.08818; loss/loc_train: 0.01294; edit/acc_train: 1.00000; edit/log_prob_train: -0.08818; edit/prob_train: 0.92064; acc/pre_train: 0.47000; acc/post_train: 0.55000; nll/pre_train: 0.69601; perplexity/pre_train: 2.00574; nll/post_train: 0.72390; perplexity/post_train: 2.06246; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08842; loss/total_train: 0.02175; loss/total_edit_train: 0.02175; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 139.85185; lr/lr0_train: 0.00014; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: -0.00000; lr/lr4_train: 0.00021; lr/lr5_train: 0.00017
[2023-03-25 15:05:22,823][trainer][INFO] - Step 6300:
[2023-03-25 15:05:22,823][trainer][INFO] - loss/edit_train: 0.09394; loss/loc_train: 0.00329; edit/acc_train: 0.99000; edit/log_prob_train: -0.09394; edit/prob_train: 0.91540; acc/pre_train: 0.37000; acc/post_train: 0.54000; nll/pre_train: 0.70368; perplexity/pre_train: 2.02118; nll/post_train: 0.69895; perplexity/post_train: 2.01165; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09008; loss/total_train: 0.01268; loss/total_edit_train: 0.01268; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 117.29543; lr/lr0_train: 0.00014; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: -0.00000; lr/lr4_train: 0.00021; lr/lr5_train: 0.00017
[2023-03-25 15:05:38,528][trainer][INFO] - Step 6400:
[2023-03-25 15:05:38,529][trainer][INFO] - loss/edit_train: 0.09545; loss/loc_train: 0.00595; edit/acc_train: 0.98000; edit/log_prob_train: -0.09545; edit/prob_train: 0.91739; acc/pre_train: 0.46000; acc/post_train: 0.44000; nll/pre_train: 0.69177; perplexity/pre_train: 1.99724; nll/post_train: 0.75246; perplexity/post_train: 2.12220; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08591; loss/total_train: 0.01549; loss/total_edit_train: 0.01549; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 131.75498; lr/lr0_train: 0.00014; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00000; lr/lr4_train: 0.00021; lr/lr5_train: 0.00017
[2023-03-25 15:05:54,050][trainer][INFO] - Step 6500:
[2023-03-25 15:05:54,051][trainer][INFO] - loss/edit_train: 0.06252; loss/loc_train: 0.00799; edit/acc_train: 1.00000; edit/log_prob_train: -0.06252; edit/prob_train: 0.94114; acc/pre_train: 0.44000; acc/post_train: 0.58000; nll/pre_train: 0.70310; perplexity/pre_train: 2.02000; nll/post_train: 0.69370; perplexity/post_train: 2.00110; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08296; loss/total_train: 0.01424; loss/total_edit_train: 0.01424; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 82.28974; lr/lr0_train: 0.00014; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00017
[2023-03-25 15:06:10,250][trainer][INFO] - Step 6600:
[2023-03-25 15:06:10,251][trainer][INFO] - loss/edit_train: 0.09219; loss/loc_train: 0.00892; edit/acc_train: 1.00000; edit/log_prob_train: -0.09219; edit/prob_train: 0.91641; acc/pre_train: 0.36000; acc/post_train: 0.44000; nll/pre_train: 0.70420; perplexity/pre_train: 2.02223; nll/post_train: 0.70949; perplexity/post_train: 2.03296; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08829; loss/total_train: 0.01814; loss/total_edit_train: 0.01814; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 99.75552; lr/lr0_train: 0.00014; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00017
[2023-03-25 15:06:26,164][trainer][INFO] - Step 6700:
[2023-03-25 15:06:26,165][trainer][INFO] - loss/edit_train: 0.09415; loss/loc_train: 0.00626; edit/acc_train: 0.98000; edit/log_prob_train: -0.09415; edit/prob_train: 0.91825; acc/pre_train: 0.54000; acc/post_train: 0.46000; nll/pre_train: 0.68784; perplexity/pre_train: 1.98942; nll/post_train: 0.70966; perplexity/post_train: 2.03331; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08679; loss/total_train: 0.01567; loss/total_edit_train: 0.01567; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 216.65246; lr/lr0_train: 0.00014; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00017
[2023-03-25 15:06:42,124][trainer][INFO] - Step 6800:
[2023-03-25 15:06:42,125][trainer][INFO] - loss/edit_train: 0.07336; loss/loc_train: 0.00348; edit/acc_train: 0.99000; edit/log_prob_train: -0.07336; edit/prob_train: 0.93335; acc/pre_train: 0.44000; acc/post_train: 0.45000; nll/pre_train: 0.69503; perplexity/pre_train: 2.00377; nll/post_train: 0.71050; perplexity/post_train: 2.03500; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08717; loss/total_train: 0.01082; loss/total_edit_train: 0.01082; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 93.83592; lr/lr0_train: 0.00014; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00017
[2023-03-25 15:06:57,670][trainer][INFO] - Step 6900:
[2023-03-25 15:06:57,670][trainer][INFO] - loss/edit_train: 0.06494; loss/loc_train: 0.00747; edit/acc_train: 0.99000; edit/log_prob_train: -0.06494; edit/prob_train: 0.94032; acc/pre_train: 0.44000; acc/post_train: 0.46000; nll/pre_train: 0.70030; perplexity/pre_train: 2.01435; nll/post_train: 0.73895; perplexity/post_train: 2.09374; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08347; loss/total_train: 0.01396; loss/total_edit_train: 0.01396; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 114.65448; lr/lr0_train: 0.00014; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00018
[2023-03-25 15:07:13,595][trainer][INFO] - Step 7000:
[2023-03-25 15:07:13,595][trainer][INFO] - loss/edit_train: 0.05524; loss/loc_train: 0.00489; edit/acc_train: 1.00000; edit/log_prob_train: -0.05524; edit/prob_train: 0.94761; acc/pre_train: 0.48000; acc/post_train: 0.62000; nll/pre_train: 0.69145; perplexity/pre_train: 1.99661; nll/post_train: 0.66824; perplexity/post_train: 1.95081; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08816; loss/total_train: 0.01042; loss/total_edit_train: 0.01042; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 174.04631; lr/lr0_train: 0.00014; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00018
[2023-03-25 15:07:28,339][trainer][INFO] - Step 7100:
[2023-03-25 15:07:28,339][trainer][INFO] - loss/edit_train: 0.08366; loss/loc_train: 0.00401; edit/acc_train: 1.00000; edit/log_prob_train: -0.08366; edit/prob_train: 0.92555; acc/pre_train: 0.47000; acc/post_train: 0.54000; nll/pre_train: 0.69989; perplexity/pre_train: 2.01354; nll/post_train: 0.70725; perplexity/post_train: 2.02840; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08084; loss/total_train: 0.01238; loss/total_edit_train: 0.01238; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 101.51530; lr/lr0_train: 0.00014; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00018
[2023-03-25 15:07:43,494][trainer][INFO] - Step 7200:
[2023-03-25 15:07:43,495][trainer][INFO] - loss/edit_train: 0.07112; loss/loc_train: 0.00442; edit/acc_train: 1.00000; edit/log_prob_train: -0.07112; edit/prob_train: 0.93523; acc/pre_train: 0.39000; acc/post_train: 0.46000; nll/pre_train: 0.71774; perplexity/pre_train: 2.04980; nll/post_train: 0.72105; perplexity/post_train: 2.05660; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08341; loss/total_train: 0.01153; loss/total_edit_train: 0.01153; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 175.07061; lr/lr0_train: 0.00014; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00018
[2023-03-25 15:07:59,079][trainer][INFO] - Step 7300:
[2023-03-25 15:07:59,079][trainer][INFO] - loss/edit_train: 0.06749; loss/loc_train: 0.00620; edit/acc_train: 0.99000; edit/log_prob_train: -0.06749; edit/prob_train: 0.93848; acc/pre_train: 0.53000; acc/post_train: 0.46000; nll/pre_train: 0.68144; perplexity/pre_train: 1.97672; nll/post_train: 0.75606; perplexity/post_train: 2.12987; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08579; loss/total_train: 0.01295; loss/total_edit_train: 0.01295; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 205.83220; lr/lr0_train: 0.00015; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00018
[2023-03-25 15:08:15,626][trainer][INFO] - Step 7400:
[2023-03-25 15:08:15,627][trainer][INFO] - loss/edit_train: 0.06150; loss/loc_train: 0.00139; edit/acc_train: 0.99000; edit/log_prob_train: -0.06150; edit/prob_train: 0.94384; acc/pre_train: 0.51000; acc/post_train: 0.52000; nll/pre_train: 0.70329; perplexity/pre_train: 2.02039; nll/post_train: 0.70334; perplexity/post_train: 2.02049; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09135; loss/total_train: 0.00754; loss/total_edit_train: 0.00754; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 55.09321; lr/lr0_train: 0.00015; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00021; lr/lr5_train: 0.00018
[2023-03-25 15:08:31,759][trainer][INFO] - Step 7500:
[2023-03-25 15:08:31,760][trainer][INFO] - loss/edit_train: 0.05683; loss/loc_train: 0.00484; edit/acc_train: 1.00000; edit/log_prob_train: -0.05683; edit/prob_train: 0.94741; acc/pre_train: 0.40000; acc/post_train: 0.49000; nll/pre_train: 0.70229; perplexity/pre_train: 2.01838; nll/post_train: 0.72383; perplexity/post_train: 2.06231; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09180; loss/total_train: 0.01052; loss/total_edit_train: 0.01052; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 67.53718; lr/lr0_train: 0.00015; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00018
[2023-03-25 15:08:47,262][trainer][INFO] - Step 7600:
[2023-03-25 15:08:47,262][trainer][INFO] - loss/edit_train: 0.05352; loss/loc_train: 0.00688; edit/acc_train: 1.00000; edit/log_prob_train: -0.05352; edit/prob_train: 0.95040; acc/pre_train: 0.47000; acc/post_train: 0.58000; nll/pre_train: 0.68844; perplexity/pre_train: 1.99060; nll/post_train: 0.72272; perplexity/post_train: 2.06003; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08842; loss/total_train: 0.01223; loss/total_edit_train: 0.01223; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 103.36618; lr/lr0_train: 0.00015; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00018
[2023-03-25 15:09:03,687][trainer][INFO] - Step 7700:
[2023-03-25 15:09:03,687][trainer][INFO] - loss/edit_train: 0.05136; loss/loc_train: 0.00149; edit/acc_train: 1.00000; edit/log_prob_train: -0.05136; edit/prob_train: 0.95233; acc/pre_train: 0.43000; acc/post_train: 0.47000; nll/pre_train: 0.70719; perplexity/pre_train: 2.02829; nll/post_train: 0.70883; perplexity/post_train: 2.03160; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08808; loss/total_train: 0.00663; loss/total_edit_train: 0.00663; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 54.61517; lr/lr0_train: 0.00015; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00018
[2023-03-25 15:09:19,655][trainer][INFO] - Step 7800:
[2023-03-25 15:09:19,655][trainer][INFO] - loss/edit_train: 0.05811; loss/loc_train: 0.00552; edit/acc_train: 1.00000; edit/log_prob_train: -0.05811; edit/prob_train: 0.94723; acc/pre_train: 0.45000; acc/post_train: 0.51000; nll/pre_train: 0.68735; perplexity/pre_train: 1.98844; nll/post_train: 0.71347; perplexity/post_train: 2.04107; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08866; loss/total_train: 0.01133; loss/total_edit_train: 0.01133; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 247.30019; lr/lr0_train: 0.00015; lr/lr1_train: 0.00008; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:09:35,584][trainer][INFO] - Step 7900:
[2023-03-25 15:09:35,585][trainer][INFO] - loss/edit_train: 0.06969; loss/loc_train: 0.00367; edit/acc_train: 0.99000; edit/log_prob_train: -0.06969; edit/prob_train: 0.93819; acc/pre_train: 0.42000; acc/post_train: 0.48000; nll/pre_train: 0.70141; perplexity/pre_train: 2.01660; nll/post_train: 0.70596; perplexity/post_train: 2.02578; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08519; loss/total_train: 0.01064; loss/total_edit_train: 0.01064; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 272.85430; lr/lr0_train: 0.00015; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:09:52,482][trainer][INFO] - Step 8000:
[2023-03-25 15:09:52,483][trainer][INFO] - loss/edit_train: 0.05614; loss/loc_train: 0.00223; edit/acc_train: 1.00000; edit/log_prob_train: -0.05614; edit/prob_train: 0.94905; acc/pre_train: 0.42000; acc/post_train: 0.48000; nll/pre_train: 0.70232; perplexity/pre_train: 2.01844; nll/post_train: 0.69684; perplexity/post_train: 2.00741; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09446; loss/total_train: 0.00784; loss/total_edit_train: 0.00784; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 60.89093; lr/lr0_train: 0.00015; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:10:08,213][trainer][INFO] - Step 8100:
[2023-03-25 15:10:08,214][trainer][INFO] - loss/edit_train: 0.04148; loss/loc_train: 0.00292; edit/acc_train: 1.00000; edit/log_prob_train: -0.04148; edit/prob_train: 0.96070; acc/pre_train: 0.46000; acc/post_train: 0.46000; nll/pre_train: 0.70772; perplexity/pre_train: 2.02936; nll/post_train: 0.71746; perplexity/post_train: 2.04923; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08737; loss/total_train: 0.00707; loss/total_edit_train: 0.00707; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 77.36218; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:10:24,320][trainer][INFO] - Step 8200:
[2023-03-25 15:10:24,320][trainer][INFO] - loss/edit_train: 0.05758; loss/loc_train: 0.00623; edit/acc_train: 1.00000; edit/log_prob_train: -0.05758; edit/prob_train: 0.94790; acc/pre_train: 0.46000; acc/post_train: 0.47000; nll/pre_train: 0.69447; perplexity/pre_train: 2.00266; nll/post_train: 0.69503; perplexity/post_train: 2.00377; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08822; loss/total_train: 0.01199; loss/total_edit_train: 0.01199; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 58.25677; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:10:39,994][trainer][INFO] - Step 8300:
[2023-03-25 15:10:39,995][trainer][INFO] - loss/edit_train: 0.07059; loss/loc_train: 0.00239; edit/acc_train: 1.00000; edit/log_prob_train: -0.07059; edit/prob_train: 0.93807; acc/pre_train: 0.46000; acc/post_train: 0.50000; nll/pre_train: 0.69413; perplexity/pre_train: 2.00197; nll/post_train: 0.67280; perplexity/post_train: 1.95972; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08859; loss/total_train: 0.00945; loss/total_edit_train: 0.00945; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 121.26177; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:10:55,778][trainer][INFO] - Step 8400:
[2023-03-25 15:10:55,778][trainer][INFO] - loss/edit_train: 0.05363; loss/loc_train: 0.00646; edit/acc_train: 1.00000; edit/log_prob_train: -0.05363; edit/prob_train: 0.95035; acc/pre_train: 0.43000; acc/post_train: 0.44000; nll/pre_train: 0.70533; perplexity/pre_train: 2.02451; nll/post_train: 0.73877; perplexity/post_train: 2.09336; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08834; loss/total_train: 0.01182; loss/total_edit_train: 0.01182; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 169.04767; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:11:11,909][trainer][INFO] - Step 8500:
[2023-03-25 15:11:11,910][trainer][INFO] - loss/edit_train: 0.03664; loss/loc_train: 0.01944; edit/acc_train: 1.00000; edit/log_prob_train: -0.03664; edit/prob_train: 0.96478; acc/pre_train: 0.37000; acc/post_train: 0.55000; nll/pre_train: 0.71145; perplexity/pre_train: 2.03695; nll/post_train: 0.66173; perplexity/post_train: 1.93815; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09087; loss/total_train: 0.02311; loss/total_edit_train: 0.02311; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 95.84151; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:11:27,139][trainer][INFO] - Step 8600:
[2023-03-25 15:11:27,140][trainer][INFO] - loss/edit_train: 0.03921; loss/loc_train: 0.00992; edit/acc_train: 1.00000; edit/log_prob_train: -0.03921; edit/prob_train: 0.96244; acc/pre_train: 0.40000; acc/post_train: 0.48000; nll/pre_train: 0.70077; perplexity/pre_train: 2.01531; nll/post_train: 0.72503; perplexity/post_train: 2.06479; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08227; loss/total_train: 0.01384; loss/total_edit_train: 0.01384; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 81.74914; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:11:42,584][trainer][INFO] - Step 8700:
[2023-03-25 15:11:42,584][trainer][INFO] - loss/edit_train: 0.09106; loss/loc_train: 0.00651; edit/acc_train: 0.95000; edit/log_prob_train: -0.09106; edit/prob_train: 0.92362; acc/pre_train: 0.39000; acc/post_train: 0.49000; nll/pre_train: 0.70321; perplexity/pre_train: 2.02024; nll/post_train: 0.72885; perplexity/post_train: 2.07269; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08456; loss/total_train: 0.01562; loss/total_edit_train: 0.01562; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 123.20649; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:11:58,596][trainer][INFO] - Step 8800:
[2023-03-25 15:11:58,596][trainer][INFO] - loss/edit_train: 0.06344; loss/loc_train: 0.00414; edit/acc_train: 1.00000; edit/log_prob_train: -0.06344; edit/prob_train: 0.94265; acc/pre_train: 0.41000; acc/post_train: 0.49000; nll/pre_train: 0.70385; perplexity/pre_train: 2.02152; nll/post_train: 0.72500; perplexity/post_train: 2.06473; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09107; loss/total_train: 0.01048; loss/total_edit_train: 0.01048; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 93.24773; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:12:14,607][trainer][INFO] - Step 8900:
[2023-03-25 15:12:14,608][trainer][INFO] - loss/edit_train: 0.03748; loss/loc_train: 0.00800; edit/acc_train: 1.00000; edit/log_prob_train: -0.03748; edit/prob_train: 0.96506; acc/pre_train: 0.45000; acc/post_train: 0.50000; nll/pre_train: 0.69613; perplexity/pre_train: 2.00597; nll/post_train: 0.72940; perplexity/post_train: 2.07384; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08840; loss/total_train: 0.01175; loss/total_edit_train: 0.01175; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 69.97760; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:12:30,586][trainer][INFO] - Step 9000:
[2023-03-25 15:12:30,587][trainer][INFO] - loss/edit_train: 0.03171; loss/loc_train: 0.00709; edit/acc_train: 1.00000; edit/log_prob_train: -0.03171; edit/prob_train: 0.96922; acc/pre_train: 0.40000; acc/post_train: 0.44000; nll/pre_train: 0.70552; perplexity/pre_train: 2.02490; nll/post_train: 0.76067; perplexity/post_train: 2.13970; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08762; loss/total_train: 0.01027; loss/total_edit_train: 0.01027; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 123.97944; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:12:46,645][trainer][INFO] - Step 9100:
[2023-03-25 15:12:46,645][trainer][INFO] - loss/edit_train: 0.06173; loss/loc_train: 0.00234; edit/acc_train: 0.98000; edit/log_prob_train: -0.06173; edit/prob_train: 0.94581; acc/pre_train: 0.34000; acc/post_train: 0.37000; nll/pre_train: 0.71773; perplexity/pre_train: 2.04978; nll/post_train: 0.72239; perplexity/post_train: 2.05935; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08835; loss/total_train: 0.00851; loss/total_edit_train: 0.00851; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 124.05943; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:13:02,436][trainer][INFO] - Step 9200:
[2023-03-25 15:13:02,436][trainer][INFO] - loss/edit_train: 0.05865; loss/loc_train: 0.00161; edit/acc_train: 1.00000; edit/log_prob_train: -0.05865; edit/prob_train: 0.94654; acc/pre_train: 0.28000; acc/post_train: 0.45000; nll/pre_train: 0.72331; perplexity/pre_train: 2.06125; nll/post_train: 0.69944; perplexity/post_train: 2.01262; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08821; loss/total_train: 0.00747; loss/total_edit_train: 0.00747; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 74.22927; lr/lr0_train: 0.00016; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:13:19,214][trainer][INFO] - Step 9300:
[2023-03-25 15:13:19,214][trainer][INFO] - loss/edit_train: 0.04832; loss/loc_train: 0.00550; edit/acc_train: 1.00000; edit/log_prob_train: -0.04832; edit/prob_train: 0.95556; acc/pre_train: 0.50000; acc/post_train: 0.56000; nll/pre_train: 0.68841; perplexity/pre_train: 1.99055; nll/post_train: 0.71094; perplexity/post_train: 2.03591; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09226; loss/total_train: 0.01033; loss/total_edit_train: 0.01033; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 148.64900; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:13:35,737][trainer][INFO] - Step 9400:
[2023-03-25 15:13:35,737][trainer][INFO] - loss/edit_train: 0.02846; loss/loc_train: 0.01847; edit/acc_train: 1.00000; edit/log_prob_train: -0.02846; edit/prob_train: 0.97214; acc/pre_train: 0.50000; acc/post_train: 0.47000; nll/pre_train: 0.70052; perplexity/pre_train: 2.01480; nll/post_train: 0.77415; perplexity/post_train: 2.16875; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08851; loss/total_train: 0.02131; loss/total_edit_train: 0.02131; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 153.61335; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:13:52,159][trainer][INFO] - Step 9500:
[2023-03-25 15:13:52,160][trainer][INFO] - loss/edit_train: 0.04382; loss/loc_train: 0.00568; edit/acc_train: 1.00000; edit/log_prob_train: -0.04382; edit/prob_train: 0.95935; acc/pre_train: 0.43000; acc/post_train: 0.53000; nll/pre_train: 0.69536; perplexity/pre_train: 2.00443; nll/post_train: 0.72674; perplexity/post_train: 2.06833; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08893; loss/total_train: 0.01006; loss/total_edit_train: 0.01006; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 56.79873; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:14:08,956][trainer][INFO] - Step 9600:
[2023-03-25 15:14:08,957][trainer][INFO] - loss/edit_train: 0.03133; loss/loc_train: 0.00116; edit/acc_train: 1.00000; edit/log_prob_train: -0.03133; edit/prob_train: 0.96939; acc/pre_train: 0.53000; acc/post_train: 0.49000; nll/pre_train: 0.68785; perplexity/pre_train: 1.98942; nll/post_train: 0.69122; perplexity/post_train: 1.99615; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09113; loss/total_train: 0.00430; loss/total_edit_train: 0.00430; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 48.43995; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:14:24,951][trainer][INFO] - Step 9700:
[2023-03-25 15:14:24,951][trainer][INFO] - loss/edit_train: 0.05279; loss/loc_train: 0.00675; edit/acc_train: 0.99000; edit/log_prob_train: -0.05279; edit/prob_train: 0.95448; acc/pre_train: 0.42000; acc/post_train: 0.47000; nll/pre_train: 0.70297; perplexity/pre_train: 2.01973; nll/post_train: 0.68622; perplexity/post_train: 1.98620; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08802; loss/total_train: 0.01203; loss/total_edit_train: 0.01203; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 98.07302; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:14:40,625][trainer][INFO] - Step 9800:
[2023-03-25 15:14:40,626][trainer][INFO] - loss/edit_train: 0.03629; loss/loc_train: 0.01068; edit/acc_train: 1.00000; edit/log_prob_train: -0.03629; edit/prob_train: 0.96528; acc/pre_train: 0.49000; acc/post_train: 0.51000; nll/pre_train: 0.68234; perplexity/pre_train: 1.97851; nll/post_train: 0.71489; perplexity/post_train: 2.04396; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08250; loss/total_train: 0.01431; loss/total_edit_train: 0.01431; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 302.72173; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:14:56,142][trainer][INFO] - Step 9900:
[2023-03-25 15:14:56,142][trainer][INFO] - loss/edit_train: 0.05364; loss/loc_train: 0.00128; edit/acc_train: 1.00000; edit/log_prob_train: -0.05364; edit/prob_train: 0.95281; acc/pre_train: 0.47000; acc/post_train: 0.42000; nll/pre_train: 0.68654; perplexity/pre_train: 1.98683; nll/post_train: 0.70215; perplexity/post_train: 2.01808; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08715; loss/total_train: 0.00664; loss/total_edit_train: 0.00664; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 40.18889; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:15:12,571][trainer][INFO] - Step 10000:
[2023-03-25 15:15:12,572][trainer][INFO] - loss/edit_train: 0.03192; loss/loc_train: 0.00603; edit/acc_train: 1.00000; edit/log_prob_train: -0.03192; edit/prob_train: 0.96933; acc/pre_train: 0.46000; acc/post_train: 0.52000; nll/pre_train: 0.68513; perplexity/pre_train: 1.98402; nll/post_train: 0.71301; perplexity/post_train: 2.04012; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09027; loss/total_train: 0.00922; loss/total_edit_train: 0.00922; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 196.47748; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:16:14,627][trainer][INFO] - Step 10000:
[2023-03-25 15:16:14,628][trainer][INFO] - loss/edit_val: 0.03716; loss/loc_val: 0.00662; edit/acc_val: 1.00000; edit/log_prob_val: -0.03716; edit/prob_val: 0.96542; acc/pre_val: 0.47600; acc/post_val: 0.51850; nll/pre_val: 0.69428; perplexity/pre_val: 2.00227; nll/post_val: 0.70168; perplexity/post_val: 2.01714; n_tokens/pre_val: 4.00000; n_tokens/post_val: 4.00000; time/edit_val: 0.08053; loss/total_val: 0.01034; loss/total_edit_val: 0.01034; memory/alloc_max_val: 3409559040.00000; memory/res_max_val: 3818913792.00000; eval_time/elapsed: 62.03087; eval_time/average: 0.12406
[2023-03-25 15:16:14,631][trainer][INFO] - Saving model to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-47-21_2742039763/models/bert-base-multilingual-uncased.2023-03-25_14-47-21_2742039763
[2023-03-25 15:16:14,632][trainer][INFO] - Moving old archive to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-47-21_2742039763/models/bert-base-multilingual-uncased.2023-03-25_14-47-21_2742039763.bk
[2023-03-25 15:16:15,700][trainer][INFO] - Write complete.
[2023-03-25 15:16:31,522][trainer][INFO] - Step 10100:
[2023-03-25 15:16:31,523][trainer][INFO] - loss/edit_train: 0.04422; loss/loc_train: 0.00734; edit/acc_train: 1.00000; edit/log_prob_train: -0.04422; edit/prob_train: 0.95894; acc/pre_train: 0.41000; acc/post_train: 0.48000; nll/pre_train: 0.69988; perplexity/pre_train: 2.01351; nll/post_train: 0.69949; perplexity/post_train: 2.01273; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09218; loss/total_train: 0.01176; loss/total_edit_train: 0.01176; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 80.94399; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:16:47,740][trainer][INFO] - Step 10200:
[2023-03-25 15:16:47,740][trainer][INFO] - loss/edit_train: 0.05264; loss/loc_train: 0.00189; edit/acc_train: 1.00000; edit/log_prob_train: -0.05264; edit/prob_train: 0.95250; acc/pre_train: 0.44000; acc/post_train: 0.49000; nll/pre_train: 0.70409; perplexity/pre_train: 2.02201; nll/post_train: 0.69000; perplexity/post_train: 1.99371; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09518; loss/total_train: 0.00715; loss/total_edit_train: 0.00715; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 34.68915; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:17:03,857][trainer][INFO] - Step 10300:
[2023-03-25 15:17:03,857][trainer][INFO] - loss/edit_train: 0.03688; loss/loc_train: 0.00262; edit/acc_train: 1.00000; edit/log_prob_train: -0.03688; edit/prob_train: 0.96559; acc/pre_train: 0.39000; acc/post_train: 0.36000; nll/pre_train: 0.70447; perplexity/pre_train: 2.02278; nll/post_train: 0.73161; perplexity/post_train: 2.07843; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09038; loss/total_train: 0.00631; loss/total_edit_train: 0.00631; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 96.75239; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:17:20,394][trainer][INFO] - Step 10400:
[2023-03-25 15:17:20,394][trainer][INFO] - loss/edit_train: 0.03432; loss/loc_train: 0.00797; edit/acc_train: 1.00000; edit/log_prob_train: -0.03432; edit/prob_train: 0.96829; acc/pre_train: 0.54000; acc/post_train: 0.49000; nll/pre_train: 0.68274; perplexity/pre_train: 1.97930; nll/post_train: 0.68354; perplexity/post_train: 1.98089; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09271; loss/total_train: 0.01141; loss/total_edit_train: 0.01141; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 259.48896; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:17:36,536][trainer][INFO] - Step 10500:
[2023-03-25 15:17:36,536][trainer][INFO] - loss/edit_train: 0.04866; loss/loc_train: 0.00439; edit/acc_train: 1.00000; edit/log_prob_train: -0.04866; edit/prob_train: 0.95663; acc/pre_train: 0.41000; acc/post_train: 0.44000; nll/pre_train: 0.69426; perplexity/pre_train: 2.00223; nll/post_train: 0.71997; perplexity/post_train: 2.05437; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08836; loss/total_train: 0.00926; loss/total_edit_train: 0.00926; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 59.36984; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:17:52,628][trainer][INFO] - Step 10600:
[2023-03-25 15:17:52,628][trainer][INFO] - loss/edit_train: 0.04998; loss/loc_train: 0.00595; edit/acc_train: 0.99000; edit/log_prob_train: -0.04998; edit/prob_train: 0.95675; acc/pre_train: 0.43000; acc/post_train: 0.37000; nll/pre_train: 0.69546; perplexity/pre_train: 2.00464; nll/post_train: 0.75041; perplexity/post_train: 2.11786; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08792; loss/total_train: 0.01095; loss/total_edit_train: 0.01095; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 125.78841; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:18:08,563][trainer][INFO] - Step 10700:
[2023-03-25 15:18:08,563][trainer][INFO] - loss/edit_train: 0.05523; loss/loc_train: 0.00314; edit/acc_train: 0.98000; edit/log_prob_train: -0.05523; edit/prob_train: 0.95333; acc/pre_train: 0.46000; acc/post_train: 0.43000; nll/pre_train: 0.69896; perplexity/pre_train: 2.01165; nll/post_train: 0.69080; perplexity/post_train: 1.99531; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08971; loss/total_train: 0.00867; loss/total_edit_train: 0.00867; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 107.32923; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00001; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:18:24,702][trainer][INFO] - Step 10800:
[2023-03-25 15:18:24,702][trainer][INFO] - loss/edit_train: 0.04651; loss/loc_train: 0.00477; edit/acc_train: 0.99000; edit/log_prob_train: -0.04651; edit/prob_train: 0.95848; acc/pre_train: 0.44000; acc/post_train: 0.45000; nll/pre_train: 0.69299; perplexity/pre_train: 1.99969; nll/post_train: 0.73589; perplexity/post_train: 2.08734; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08806; loss/total_train: 0.00942; loss/total_edit_train: 0.00942; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 112.34591; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00000; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:18:40,370][trainer][INFO] - Step 10900:
[2023-03-25 15:18:40,371][trainer][INFO] - loss/edit_train: 0.03937; loss/loc_train: 0.00572; edit/acc_train: 1.00000; edit/log_prob_train: -0.03937; edit/prob_train: 0.96383; acc/pre_train: 0.47000; acc/post_train: 0.49000; nll/pre_train: 0.69647; perplexity/pre_train: 2.00665; nll/post_train: 0.72012; perplexity/post_train: 2.05467; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09170; loss/total_train: 0.00966; loss/total_edit_train: 0.00966; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 90.51527; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00001; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:18:56,069][trainer][INFO] - Step 11000:
[2023-03-25 15:18:56,069][trainer][INFO] - loss/edit_train: 0.03958; loss/loc_train: 0.00178; edit/acc_train: 1.00000; edit/log_prob_train: -0.03958; edit/prob_train: 0.96470; acc/pre_train: 0.49000; acc/post_train: 0.52000; nll/pre_train: 0.69335; perplexity/pre_train: 2.00041; nll/post_train: 0.69530; perplexity/post_train: 2.00432; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08685; loss/total_train: 0.00574; loss/total_edit_train: 0.00574; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 82.38306; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00001; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:19:12,197][trainer][INFO] - Step 11100:
[2023-03-25 15:19:12,198][trainer][INFO] - loss/edit_train: 0.04168; loss/loc_train: 0.00259; edit/acc_train: 1.00000; edit/log_prob_train: -0.04168; edit/prob_train: 0.96237; acc/pre_train: 0.44000; acc/post_train: 0.51000; nll/pre_train: 0.70255; perplexity/pre_train: 2.01890; nll/post_train: 0.71492; perplexity/post_train: 2.04403; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08795; loss/total_train: 0.00676; loss/total_edit_train: 0.00676; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 55.41353; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00001; lr/lr4_train: 0.00023; lr/lr5_train: 0.00021
[2023-03-25 15:19:28,439][trainer][INFO] - Step 11200:
[2023-03-25 15:19:28,439][trainer][INFO] - loss/edit_train: 0.04562; loss/loc_train: 0.00212; edit/acc_train: 1.00000; edit/log_prob_train: -0.04562; edit/prob_train: 0.95899; acc/pre_train: 0.48000; acc/post_train: 0.48000; nll/pre_train: 0.69442; perplexity/pre_train: 2.00256; nll/post_train: 0.69924; perplexity/post_train: 2.01222; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08965; loss/total_train: 0.00669; loss/total_edit_train: 0.00669; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 63.65997; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00001; lr/lr4_train: 0.00023; lr/lr5_train: 0.00021
[2023-03-25 15:19:44,487][trainer][INFO] - Step 11300:
[2023-03-25 15:19:44,488][trainer][INFO] - loss/edit_train: 0.02231; loss/loc_train: 0.00132; edit/acc_train: 1.00000; edit/log_prob_train: -0.02231; edit/prob_train: 0.97800; acc/pre_train: 0.50000; acc/post_train: 0.53000; nll/pre_train: 0.68722; perplexity/pre_train: 1.98819; nll/post_train: 0.68964; perplexity/post_train: 1.99299; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08709; loss/total_train: 0.00356; loss/total_edit_train: 0.00356; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 22.34333; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00021
[2023-03-25 15:20:00,576][trainer][INFO] - Step 11400:
[2023-03-25 15:20:00,576][trainer][INFO] - loss/edit_train: 0.03071; loss/loc_train: 0.01157; edit/acc_train: 1.00000; edit/log_prob_train: -0.03071; edit/prob_train: 0.97100; acc/pre_train: 0.44000; acc/post_train: 0.44000; nll/pre_train: 0.70201; perplexity/pre_train: 2.01780; nll/post_train: 0.76819; perplexity/post_train: 2.15586; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08942; loss/total_train: 0.01464; loss/total_edit_train: 0.01464; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 110.65252; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00021
[2023-03-25 15:20:16,909][trainer][INFO] - Step 11500:
[2023-03-25 15:20:16,910][trainer][INFO] - loss/edit_train: 0.04180; loss/loc_train: 0.00408; edit/acc_train: 1.00000; edit/log_prob_train: -0.04180; edit/prob_train: 0.96210; acc/pre_train: 0.47000; acc/post_train: 0.55000; nll/pre_train: 0.68739; perplexity/pre_train: 1.98851; nll/post_train: 0.69156; perplexity/post_train: 1.99684; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08992; loss/total_train: 0.00826; loss/total_edit_train: 0.00826; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 64.03787; lr/lr0_train: 0.00017; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00021
[2023-03-25 15:20:33,030][trainer][INFO] - Step 11600:
[2023-03-25 15:20:33,030][trainer][INFO] - loss/edit_train: 0.03677; loss/loc_train: 0.00472; edit/acc_train: 0.99000; edit/log_prob_train: -0.03677; edit/prob_train: 0.96734; acc/pre_train: 0.41000; acc/post_train: 0.48000; nll/pre_train: 0.71121; perplexity/pre_train: 2.03646; nll/post_train: 0.72909; perplexity/post_train: 2.07320; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08912; loss/total_train: 0.00839; loss/total_edit_train: 0.00839; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 334.70804; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00014; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00021
[2023-03-25 15:20:49,277][trainer][INFO] - Step 11700:
[2023-03-25 15:20:49,277][trainer][INFO] - loss/edit_train: 0.02400; loss/loc_train: 0.00283; edit/acc_train: 1.00000; edit/log_prob_train: -0.02400; edit/prob_train: 0.97646; acc/pre_train: 0.43000; acc/post_train: 0.51000; nll/pre_train: 0.69025; perplexity/pre_train: 1.99421; nll/post_train: 0.70364; perplexity/post_train: 2.02110; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08928; loss/total_train: 0.00523; loss/total_edit_train: 0.00523; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 61.95535; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00014; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00021
[2023-03-25 15:21:05,522][trainer][INFO] - Step 11800:
[2023-03-25 15:21:05,523][trainer][INFO] - loss/edit_train: 0.02211; loss/loc_train: 0.00624; edit/acc_train: 1.00000; edit/log_prob_train: -0.02211; edit/prob_train: 0.97822; acc/pre_train: 0.50000; acc/post_train: 0.52000; nll/pre_train: 0.68600; perplexity/pre_train: 1.98576; nll/post_train: 0.70268; perplexity/post_train: 2.01917; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09095; loss/total_train: 0.00845; loss/total_edit_train: 0.00845; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 72.30096; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00014; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00021
[2023-03-25 15:21:20,837][trainer][INFO] - Step 11900:
[2023-03-25 15:21:20,837][trainer][INFO] - loss/edit_train: 0.04170; loss/loc_train: 0.00998; edit/acc_train: 1.00000; edit/log_prob_train: -0.04170; edit/prob_train: 0.96110; acc/pre_train: 0.48000; acc/post_train: 0.41000; nll/pre_train: 0.70446; perplexity/pre_train: 2.02276; nll/post_train: 0.78196; perplexity/post_train: 2.18574; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08671; loss/total_train: 0.01415; loss/total_edit_train: 0.01415; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 249.22030; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00014; lr/lr3_train: 0.00000; lr/lr4_train: 0.00023; lr/lr5_train: 0.00021
[2023-03-25 15:21:36,760][trainer][INFO] - Step 12000:
[2023-03-25 15:21:36,760][trainer][INFO] - loss/edit_train: 0.03956; loss/loc_train: 0.00604; edit/acc_train: 1.00000; edit/log_prob_train: -0.03956; edit/prob_train: 0.96444; acc/pre_train: 0.46000; acc/post_train: 0.46000; nll/pre_train: 0.69941; perplexity/pre_train: 2.01257; nll/post_train: 0.68537; perplexity/post_train: 1.98451; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08830; loss/total_train: 0.00999; loss/total_edit_train: 0.00999; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 175.55282; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00014; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:21:52,533][trainer][INFO] - Step 12100:
[2023-03-25 15:21:52,534][trainer][INFO] - loss/edit_train: 0.05812; loss/loc_train: 0.00723; edit/acc_train: 1.00000; edit/log_prob_train: -0.05812; edit/prob_train: 0.94517; acc/pre_train: 0.39000; acc/post_train: 0.43000; nll/pre_train: 0.70438; perplexity/pre_train: 2.02260; nll/post_train: 0.74734; perplexity/post_train: 2.11137; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08817; loss/total_train: 0.01305; loss/total_edit_train: 0.01305; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 84.64760; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00014; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00021
[2023-03-25 15:22:08,343][trainer][INFO] - Step 12200:
[2023-03-25 15:22:08,344][trainer][INFO] - loss/edit_train: 0.05123; loss/loc_train: 0.00368; edit/acc_train: 1.00000; edit/log_prob_train: -0.05123; edit/prob_train: 0.95234; acc/pre_train: 0.56000; acc/post_train: 0.42000; nll/pre_train: 0.68907; perplexity/pre_train: 1.99187; nll/post_train: 0.69621; perplexity/post_train: 2.00614; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08973; loss/total_train: 0.00880; loss/total_edit_train: 0.00880; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 40.01402; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00014; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:22:24,275][trainer][INFO] - Step 12300:
[2023-03-25 15:22:24,275][trainer][INFO] - loss/edit_train: 0.03700; loss/loc_train: 0.00936; edit/acc_train: 1.00000; edit/log_prob_train: -0.03700; edit/prob_train: 0.96543; acc/pre_train: 0.50000; acc/post_train: 0.55000; nll/pre_train: 0.69775; perplexity/pre_train: 2.00922; nll/post_train: 0.67216; perplexity/post_train: 1.95846; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08872; loss/total_train: 0.01306; loss/total_edit_train: 0.01306; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 103.44044; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00014; lr/lr3_train: -0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:22:40,076][trainer][INFO] - Step 12400:
[2023-03-25 15:22:40,076][trainer][INFO] - loss/edit_train: 0.03907; loss/loc_train: 0.00839; edit/acc_train: 1.00000; edit/log_prob_train: -0.03907; edit/prob_train: 0.96297; acc/pre_train: 0.55000; acc/post_train: 0.57000; nll/pre_train: 0.70041; perplexity/pre_train: 2.01457; nll/post_train: 0.71718; perplexity/post_train: 2.04865; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09113; loss/total_train: 0.01230; loss/total_edit_train: 0.01230; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 71.77834; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00013; lr/lr3_train: -0.00000; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:22:56,887][trainer][INFO] - Step 12500:
[2023-03-25 15:22:56,887][trainer][INFO] - loss/edit_train: 0.05729; loss/loc_train: 0.00460; edit/acc_train: 0.99000; edit/log_prob_train: -0.05729; edit/prob_train: 0.94862; acc/pre_train: 0.41000; acc/post_train: 0.42000; nll/pre_train: 0.70106; perplexity/pre_train: 2.01588; nll/post_train: 0.71365; perplexity/post_train: 2.04143; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09206; loss/total_train: 0.01033; loss/total_edit_train: 0.01033; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 66.05412; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00014; lr/lr3_train: -0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:23:12,961][trainer][INFO] - Step 12600:
[2023-03-25 15:23:12,961][trainer][INFO] - loss/edit_train: 0.04620; loss/loc_train: 0.00327; edit/acc_train: 0.99000; edit/log_prob_train: -0.04620; edit/prob_train: 0.95902; acc/pre_train: 0.53000; acc/post_train: 0.60000; nll/pre_train: 0.67714; perplexity/pre_train: 1.96825; nll/post_train: 0.67165; perplexity/post_train: 1.95746; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08885; loss/total_train: 0.00789; loss/total_edit_train: 0.00789; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 49.38749; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00014; lr/lr3_train: -0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:23:28,887][trainer][INFO] - Step 12700:
[2023-03-25 15:23:28,887][trainer][INFO] - loss/edit_train: 0.04572; loss/loc_train: 0.01203; edit/acc_train: 1.00000; edit/log_prob_train: -0.04572; edit/prob_train: 0.95765; acc/pre_train: 0.41000; acc/post_train: 0.46000; nll/pre_train: 0.70701; perplexity/pre_train: 2.02791; nll/post_train: 0.74354; perplexity/post_train: 2.10336; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08811; loss/total_train: 0.01660; loss/total_edit_train: 0.01660; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 248.01085; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00013; lr/lr3_train: -0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:23:45,171][trainer][INFO] - Step 12800:
[2023-03-25 15:23:45,172][trainer][INFO] - loss/edit_train: 0.04672; loss/loc_train: 0.00123; edit/acc_train: 0.99000; edit/log_prob_train: -0.04672; edit/prob_train: 0.95814; acc/pre_train: 0.47000; acc/post_train: 0.53000; nll/pre_train: 0.69026; perplexity/pre_train: 1.99423; nll/post_train: 0.68716; perplexity/post_train: 1.98805; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09019; loss/total_train: 0.00590; loss/total_edit_train: 0.00590; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 29.76464; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: -0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:24:01,211][trainer][INFO] - Step 12900:
[2023-03-25 15:24:01,211][trainer][INFO] - loss/edit_train: 0.04664; loss/loc_train: 0.00403; edit/acc_train: 0.99000; edit/log_prob_train: -0.04664; edit/prob_train: 0.95812; acc/pre_train: 0.41000; acc/post_train: 0.42000; nll/pre_train: 0.69871; perplexity/pre_train: 2.01116; nll/post_train: 0.70739; perplexity/post_train: 2.02870; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08819; loss/total_train: 0.00870; loss/total_edit_train: 0.00870; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 34.55987; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00013; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:24:17,442][trainer][INFO] - Step 13000:
[2023-03-25 15:24:17,443][trainer][INFO] - loss/edit_train: 0.03832; loss/loc_train: 0.00377; edit/acc_train: 1.00000; edit/log_prob_train: -0.03832; edit/prob_train: 0.96413; acc/pre_train: 0.40000; acc/post_train: 0.47000; nll/pre_train: 0.69232; perplexity/pre_train: 1.99834; nll/post_train: 0.72978; perplexity/post_train: 2.07463; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08904; loss/total_train: 0.00761; loss/total_edit_train: 0.00761; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 224.51723; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00000; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:24:33,926][trainer][INFO] - Step 13100:
[2023-03-25 15:24:33,926][trainer][INFO] - loss/edit_train: 0.06158; loss/loc_train: 0.00612; edit/acc_train: 0.99000; edit/log_prob_train: -0.06158; edit/prob_train: 0.94917; acc/pre_train: 0.33000; acc/post_train: 0.34000; nll/pre_train: 0.72692; perplexity/pre_train: 2.06869; nll/post_train: 0.74507; perplexity/post_train: 2.10659; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09245; loss/total_train: 0.01228; loss/total_edit_train: 0.01228; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 79.18842; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:24:50,346][trainer][INFO] - Step 13200:
[2023-03-25 15:24:50,347][trainer][INFO] - loss/edit_train: 0.06628; loss/loc_train: 0.00180; edit/acc_train: 0.99000; edit/log_prob_train: -0.06628; edit/prob_train: 0.94363; acc/pre_train: 0.48000; acc/post_train: 0.51000; nll/pre_train: 0.68998; perplexity/pre_train: 1.99367; nll/post_train: 0.68382; perplexity/post_train: 1.98144; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09116; loss/total_train: 0.00843; loss/total_edit_train: 0.00843; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 86.26618; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:25:06,267][trainer][INFO] - Step 13300:
[2023-03-25 15:25:06,267][trainer][INFO] - loss/edit_train: 0.05353; loss/loc_train: 0.00113; edit/acc_train: 1.00000; edit/log_prob_train: -0.05353; edit/prob_train: 0.95287; acc/pre_train: 0.52000; acc/post_train: 0.55000; nll/pre_train: 0.69030; perplexity/pre_train: 1.99430; nll/post_train: 0.68385; perplexity/post_train: 1.98149; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08737; loss/total_train: 0.00648; loss/total_edit_train: 0.00648; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 44.50162; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:25:22,447][trainer][INFO] - Step 13400:
[2023-03-25 15:25:22,447][trainer][INFO] - loss/edit_train: 0.03549; loss/loc_train: 0.00410; edit/acc_train: 1.00000; edit/log_prob_train: -0.03549; edit/prob_train: 0.96732; acc/pre_train: 0.46000; acc/post_train: 0.42000; nll/pre_train: 0.69963; perplexity/pre_train: 2.01300; nll/post_train: 0.70432; perplexity/post_train: 2.02247; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08848; loss/total_train: 0.00764; loss/total_edit_train: 0.00764; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 159.55622; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:25:38,634][trainer][INFO] - Step 13500:
[2023-03-25 15:25:38,634][trainer][INFO] - loss/edit_train: 0.03375; loss/loc_train: 0.00762; edit/acc_train: 1.00000; edit/log_prob_train: -0.03375; edit/prob_train: 0.96890; acc/pre_train: 0.47000; acc/post_train: 0.51000; nll/pre_train: 0.68756; perplexity/pre_train: 1.98885; nll/post_train: 0.69976; perplexity/post_train: 2.01328; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08971; loss/total_train: 0.01100; loss/total_edit_train: 0.01100; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 234.84408; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:25:54,341][trainer][INFO] - Step 13600:
[2023-03-25 15:25:54,342][trainer][INFO] - loss/edit_train: 0.04891; loss/loc_train: 0.00531; edit/acc_train: 0.99000; edit/log_prob_train: -0.04891; edit/prob_train: 0.95774; acc/pre_train: 0.40000; acc/post_train: 0.44000; nll/pre_train: 0.70529; perplexity/pre_train: 2.02444; nll/post_train: 0.69122; perplexity/post_train: 1.99616; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08744; loss/total_train: 0.01020; loss/total_edit_train: 0.01020; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 33.33874; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:26:10,634][trainer][INFO] - Step 13700:
[2023-03-25 15:26:10,635][trainer][INFO] - loss/edit_train: 0.03968; loss/loc_train: 0.00692; edit/acc_train: 1.00000; edit/log_prob_train: -0.03968; edit/prob_train: 0.96408; acc/pre_train: 0.36000; acc/post_train: 0.54000; nll/pre_train: 0.70945; perplexity/pre_train: 2.03286; nll/post_train: 0.67864; perplexity/post_train: 1.97120; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09173; loss/total_train: 0.01089; loss/total_edit_train: 0.01089; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 161.01304; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:26:26,741][trainer][INFO] - Step 13800:
[2023-03-25 15:26:26,742][trainer][INFO] - loss/edit_train: 0.03088; loss/loc_train: 0.00205; edit/acc_train: 1.00000; edit/log_prob_train: -0.03088; edit/prob_train: 0.97119; acc/pre_train: 0.39000; acc/post_train: 0.38000; nll/pre_train: 0.70909; perplexity/pre_train: 2.03215; nll/post_train: 0.72572; perplexity/post_train: 2.06622; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08914; loss/total_train: 0.00514; loss/total_edit_train: 0.00514; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 53.98399; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00001; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:26:42,700][trainer][INFO] - Step 13900:
[2023-03-25 15:26:42,701][trainer][INFO] - loss/edit_train: 0.03679; loss/loc_train: 0.00138; edit/acc_train: 0.99000; edit/log_prob_train: -0.03679; edit/prob_train: 0.96678; acc/pre_train: 0.49000; acc/post_train: 0.52000; nll/pre_train: 0.69389; perplexity/pre_train: 2.00148; nll/post_train: 0.69043; perplexity/post_train: 1.99457; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09147; loss/total_train: 0.00506; loss/total_edit_train: 0.00506; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 64.52513; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00002; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:26:58,872][trainer][INFO] - Step 14000:
[2023-03-25 15:26:58,872][trainer][INFO] - loss/edit_train: 0.04109; loss/loc_train: 0.00181; edit/acc_train: 0.99000; edit/log_prob_train: -0.04109; edit/prob_train: 0.96351; acc/pre_train: 0.45000; acc/post_train: 0.54000; nll/pre_train: 0.70030; perplexity/pre_train: 2.01435; nll/post_train: 0.68669; perplexity/post_train: 1.98712; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09159; loss/total_train: 0.00592; loss/total_edit_train: 0.00592; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 83.87904; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00002; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:27:15,004][trainer][INFO] - Step 14100:
[2023-03-25 15:27:15,005][trainer][INFO] - loss/edit_train: 0.04072; loss/loc_train: 0.00694; edit/acc_train: 1.00000; edit/log_prob_train: -0.04072; edit/prob_train: 0.96289; acc/pre_train: 0.43000; acc/post_train: 0.50000; nll/pre_train: 0.69135; perplexity/pre_train: 1.99642; nll/post_train: 0.72524; perplexity/post_train: 2.06523; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08798; loss/total_train: 0.01101; loss/total_edit_train: 0.01101; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 120.46229; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:27:32,088][trainer][INFO] - Step 14200:
[2023-03-25 15:27:32,089][trainer][INFO] - loss/edit_train: 0.03401; loss/loc_train: 0.00337; edit/acc_train: 1.00000; edit/log_prob_train: -0.03401; edit/prob_train: 0.96928; acc/pre_train: 0.40000; acc/post_train: 0.46000; nll/pre_train: 0.71190; perplexity/pre_train: 2.03785; nll/post_train: 0.71845; perplexity/post_train: 2.05125; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09534; loss/total_train: 0.00677; loss/total_edit_train: 0.00677; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 89.62649; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00012; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:27:48,422][trainer][INFO] - Step 14300:
[2023-03-25 15:27:48,423][trainer][INFO] - loss/edit_train: 0.04103; loss/loc_train: 0.01556; edit/acc_train: 1.00000; edit/log_prob_train: -0.04103; edit/prob_train: 0.96161; acc/pre_train: 0.49000; acc/post_train: 0.51000; nll/pre_train: 0.69133; perplexity/pre_train: 1.99636; nll/post_train: 0.75052; perplexity/post_train: 2.11810; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08910; loss/total_train: 0.01966; loss/total_edit_train: 0.01966; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 204.94101; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00001; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:28:04,270][trainer][INFO] - Step 14400:
[2023-03-25 15:28:04,271][trainer][INFO] - loss/edit_train: 0.05702; loss/loc_train: 0.00442; edit/acc_train: 0.98000; edit/log_prob_train: -0.05702; edit/prob_train: 0.95124; acc/pre_train: 0.41000; acc/post_train: 0.50000; nll/pre_train: 0.70537; perplexity/pre_train: 2.02459; nll/post_train: 0.71468; perplexity/post_train: 2.04353; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08636; loss/total_train: 0.01012; loss/total_edit_train: 0.01012; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 97.77272; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:28:20,384][trainer][INFO] - Step 14500:
[2023-03-25 15:28:20,384][trainer][INFO] - loss/edit_train: 0.06541; loss/loc_train: 0.00270; edit/acc_train: 0.98000; edit/log_prob_train: -0.06541; edit/prob_train: 0.94544; acc/pre_train: 0.43000; acc/post_train: 0.53000; nll/pre_train: 0.70219; perplexity/pre_train: 2.01816; nll/post_train: 0.70393; perplexity/post_train: 2.02168; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08873; loss/total_train: 0.00925; loss/total_edit_train: 0.00925; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 68.77533; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:28:36,586][trainer][INFO] - Step 14600:
[2023-03-25 15:28:36,586][trainer][INFO] - loss/edit_train: 0.02759; loss/loc_train: 0.00493; edit/acc_train: 1.00000; edit/log_prob_train: -0.02759; edit/prob_train: 0.97312; acc/pre_train: 0.51000; acc/post_train: 0.50000; nll/pre_train: 0.69229; perplexity/pre_train: 1.99828; nll/post_train: 0.70868; perplexity/post_train: 2.03132; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08965; loss/total_train: 0.00769; loss/total_edit_train: 0.00769; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 134.74182; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:28:52,886][trainer][INFO] - Step 14700:
[2023-03-25 15:28:52,887][trainer][INFO] - loss/edit_train: 0.03811; loss/loc_train: 0.00457; edit/acc_train: 0.99000; edit/log_prob_train: -0.03811; edit/prob_train: 0.96563; acc/pre_train: 0.48000; acc/post_train: 0.51000; nll/pre_train: 0.68715; perplexity/pre_train: 1.98805; nll/post_train: 0.72495; perplexity/post_train: 2.06462; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08968; loss/total_train: 0.00838; loss/total_edit_train: 0.00838; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 90.48621; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00022; lr/lr5_train: 0.00019
[2023-03-25 15:29:08,951][trainer][INFO] - Step 14800:
[2023-03-25 15:29:08,951][trainer][INFO] - loss/edit_train: 0.05102; loss/loc_train: 0.00451; edit/acc_train: 1.00000; edit/log_prob_train: -0.05102; edit/prob_train: 0.95472; acc/pre_train: 0.47000; acc/post_train: 0.55000; nll/pre_train: 0.69692; perplexity/pre_train: 2.00756; nll/post_train: 0.68535; perplexity/post_train: 1.98447; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08973; loss/total_train: 0.00961; loss/total_edit_train: 0.00961; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 42.15559; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00022; lr/lr5_train: 0.00020
[2023-03-25 15:29:25,161][trainer][INFO] - Step 14900:
[2023-03-25 15:29:25,161][trainer][INFO] - loss/edit_train: 0.03086; loss/loc_train: 0.00303; edit/acc_train: 1.00000; edit/log_prob_train: -0.03086; edit/prob_train: 0.97133; acc/pre_train: 0.43000; acc/post_train: 0.50000; nll/pre_train: 0.69789; perplexity/pre_train: 2.00950; nll/post_train: 0.72500; perplexity/post_train: 2.06473; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08881; loss/total_train: 0.00612; loss/total_edit_train: 0.00612; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 100.76037; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:29:41,921][trainer][INFO] - Step 15000:
[2023-03-25 15:29:41,921][trainer][INFO] - loss/edit_train: 0.03662; loss/loc_train: 0.00253; edit/acc_train: 0.99000; edit/log_prob_train: -0.03662; edit/prob_train: 0.96653; acc/pre_train: 0.45000; acc/post_train: 0.54000; nll/pre_train: 0.69379; perplexity/pre_train: 2.00129; nll/post_train: 0.71150; perplexity/post_train: 2.03705; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09095; loss/total_train: 0.00619; loss/total_edit_train: 0.00619; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 297.36154; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:30:44,096][trainer][INFO] - Step 15000:
[2023-03-25 15:30:44,097][trainer][INFO] - loss/edit_val: 0.03644; loss/loc_val: 0.00387; edit/acc_val: 0.99800; edit/log_prob_val: -0.03644; edit/prob_val: 0.96706; acc/pre_val: 0.47600; acc/post_val: 0.49950; nll/pre_val: 0.69428; perplexity/pre_val: 2.00227; nll/post_val: 0.70219; perplexity/post_val: 2.01817; n_tokens/pre_val: 4.00000; n_tokens/post_val: 4.00000; time/edit_val: 0.08315; loss/total_val: 0.00751; loss/total_edit_val: 0.00751; memory/alloc_max_val: 3409559040.00000; memory/res_max_val: 3818913792.00000; eval_time/elapsed: 62.13462; eval_time/average: 0.12427
[2023-03-25 15:30:44,101][trainer][INFO] - Saving model to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-47-21_2742039763/models/bert-base-multilingual-uncased.2023-03-25_14-47-21_2742039763
[2023-03-25 15:30:44,101][trainer][INFO] - Moving old archive to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-47-21_2742039763/models/bert-base-multilingual-uncased.2023-03-25_14-47-21_2742039763.bk
[2023-03-25 15:30:45,173][trainer][INFO] - Write complete.
[2023-03-25 15:31:01,490][trainer][INFO] - Step 15100:
[2023-03-25 15:31:01,490][trainer][INFO] - loss/edit_train: 0.05128; loss/loc_train: 0.00496; edit/acc_train: 0.99000; edit/log_prob_train: -0.05128; edit/prob_train: 0.95463; acc/pre_train: 0.46000; acc/post_train: 0.44000; nll/pre_train: 0.68583; perplexity/pre_train: 1.98542; nll/post_train: 0.73096; perplexity/post_train: 2.07708; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09141; loss/total_train: 0.01009; loss/total_edit_train: 0.01009; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 78.62788; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:31:17,140][trainer][INFO] - Step 15200:
[2023-03-25 15:31:17,141][trainer][INFO] - loss/edit_train: 0.02990; loss/loc_train: 0.00124; edit/acc_train: 1.00000; edit/log_prob_train: -0.02990; edit/prob_train: 0.97194; acc/pre_train: 0.37000; acc/post_train: 0.52000; nll/pre_train: 0.70208; perplexity/pre_train: 2.01794; nll/post_train: 0.69494; perplexity/post_train: 2.00360; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09210; loss/total_train: 0.00423; loss/total_edit_train: 0.00423; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 30.76798; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:31:32,514][trainer][INFO] - Step 15300:
[2023-03-25 15:31:32,514][trainer][INFO] - loss/edit_train: 0.03756; loss/loc_train: 0.01390; edit/acc_train: 0.99000; edit/log_prob_train: -0.03756; edit/prob_train: 0.96600; acc/pre_train: 0.38000; acc/post_train: 0.45000; nll/pre_train: 0.71198; perplexity/pre_train: 2.03803; nll/post_train: 0.70746; perplexity/post_train: 2.02884; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08826; loss/total_train: 0.01766; loss/total_edit_train: 0.01766; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 144.25045; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:31:48,455][trainer][INFO] - Step 15400:
[2023-03-25 15:31:48,455][trainer][INFO] - loss/edit_train: 0.02173; loss/loc_train: 0.00108; edit/acc_train: 1.00000; edit/log_prob_train: -0.02173; edit/prob_train: 0.97863; acc/pre_train: 0.50000; acc/post_train: 0.47000; nll/pre_train: 0.69170; perplexity/pre_train: 1.99710; nll/post_train: 0.69997; perplexity/post_train: 2.01368; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09254; loss/total_train: 0.00325; loss/total_edit_train: 0.00325; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 10.97231; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:32:04,371][trainer][INFO] - Step 15500:
[2023-03-25 15:32:04,371][trainer][INFO] - loss/edit_train: 0.05031; loss/loc_train: 0.00272; edit/acc_train: 0.98000; edit/log_prob_train: -0.05031; edit/prob_train: 0.95759; acc/pre_train: 0.44000; acc/post_train: 0.47000; nll/pre_train: 0.70315; perplexity/pre_train: 2.02011; nll/post_train: 0.72327; perplexity/post_train: 2.06117; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08928; loss/total_train: 0.00775; loss/total_edit_train: 0.00775; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 147.41004; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:32:20,665][trainer][INFO] - Step 15600:
[2023-03-25 15:32:20,665][trainer][INFO] - loss/edit_train: 0.06437; loss/loc_train: 0.00171; edit/acc_train: 0.98000; edit/log_prob_train: -0.06437; edit/prob_train: 0.94665; acc/pre_train: 0.38000; acc/post_train: 0.43000; nll/pre_train: 0.71278; perplexity/pre_train: 2.03965; nll/post_train: 0.69521; perplexity/post_train: 2.00413; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08853; loss/total_train: 0.00815; loss/total_edit_train: 0.00815; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 128.92287; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:32:37,062][trainer][INFO] - Step 15700:
[2023-03-25 15:32:37,063][trainer][INFO] - loss/edit_train: 0.03450; loss/loc_train: 0.00120; edit/acc_train: 1.00000; edit/log_prob_train: -0.03450; edit/prob_train: 0.96787; acc/pre_train: 0.49000; acc/post_train: 0.54000; nll/pre_train: 0.69425; perplexity/pre_train: 2.00222; nll/post_train: 0.68939; perplexity/post_train: 1.99250; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09111; loss/total_train: 0.00465; loss/total_edit_train: 0.00465; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 66.74008; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:32:53,611][trainer][INFO] - Step 15800:
[2023-03-25 15:32:53,612][trainer][INFO] - loss/edit_train: 0.04197; loss/loc_train: 0.00234; edit/acc_train: 1.00000; edit/log_prob_train: -0.04197; edit/prob_train: 0.96304; acc/pre_train: 0.50000; acc/post_train: 0.54000; nll/pre_train: 0.68616; perplexity/pre_train: 1.98607; nll/post_train: 0.70126; perplexity/post_train: 2.01629; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09082; loss/total_train: 0.00653; loss/total_edit_train: 0.00653; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 70.14719; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:33:09,978][trainer][INFO] - Step 15900:
[2023-03-25 15:33:09,979][trainer][INFO] - loss/edit_train: 0.02968; loss/loc_train: 0.00125; edit/acc_train: 1.00000; edit/log_prob_train: -0.02968; edit/prob_train: 0.97232; acc/pre_train: 0.39000; acc/post_train: 0.45000; nll/pre_train: 0.71449; perplexity/pre_train: 2.04314; nll/post_train: 0.70892; perplexity/post_train: 2.03179; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09144; loss/total_train: 0.00422; loss/total_edit_train: 0.00422; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 16.85276; lr/lr0_train: 0.00018; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:33:26,473][trainer][INFO] - Step 16000:
[2023-03-25 15:33:26,474][trainer][INFO] - loss/edit_train: 0.05020; loss/loc_train: 0.00326; edit/acc_train: 1.00000; edit/log_prob_train: -0.05020; edit/prob_train: 0.95677; acc/pre_train: 0.44000; acc/post_train: 0.46000; nll/pre_train: 0.69990; perplexity/pre_train: 2.01355; nll/post_train: 0.70242; perplexity/post_train: 2.01864; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08959; loss/total_train: 0.00828; loss/total_edit_train: 0.00828; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 94.47577; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:33:43,644][trainer][INFO] - Step 16100:
[2023-03-25 15:33:43,645][trainer][INFO] - loss/edit_train: 0.03720; loss/loc_train: 0.00162; edit/acc_train: 0.99000; edit/log_prob_train: -0.03720; edit/prob_train: 0.96706; acc/pre_train: 0.52000; acc/post_train: 0.52000; nll/pre_train: 0.67861; perplexity/pre_train: 1.97114; nll/post_train: 0.67240; perplexity/post_train: 1.95893; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09670; loss/total_train: 0.00534; loss/total_edit_train: 0.00534; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 69.74575; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:34:00,775][trainer][INFO] - Step 16200:
[2023-03-25 15:34:00,776][trainer][INFO] - loss/edit_train: 0.03590; loss/loc_train: 0.00358; edit/acc_train: 0.99000; edit/log_prob_train: -0.03590; edit/prob_train: 0.96851; acc/pre_train: 0.46000; acc/post_train: 0.46000; nll/pre_train: 0.69592; perplexity/pre_train: 2.00555; nll/post_train: 0.69268; perplexity/post_train: 1.99907; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09193; loss/total_train: 0.00717; loss/total_edit_train: 0.00717; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 116.32262; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:34:18,094][trainer][INFO] - Step 16300:
[2023-03-25 15:34:18,094][trainer][INFO] - loss/edit_train: 0.02180; loss/loc_train: 0.00485; edit/acc_train: 1.00000; edit/log_prob_train: -0.02180; edit/prob_train: 0.97860; acc/pre_train: 0.41000; acc/post_train: 0.42000; nll/pre_train: 0.70549; perplexity/pre_train: 2.02484; nll/post_train: 0.73480; perplexity/post_train: 2.08506; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09511; loss/total_train: 0.00703; loss/total_edit_train: 0.00703; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 68.66523; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:34:35,067][trainer][INFO] - Step 16400:
[2023-03-25 15:34:35,067][trainer][INFO] - loss/edit_train: 0.02494; loss/loc_train: 0.00580; edit/acc_train: 1.00000; edit/log_prob_train: -0.02494; edit/prob_train: 0.97622; acc/pre_train: 0.54000; acc/post_train: 0.48000; nll/pre_train: 0.67862; perplexity/pre_train: 1.97116; nll/post_train: 0.72133; perplexity/post_train: 2.05717; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09445; loss/total_train: 0.00830; loss/total_edit_train: 0.00830; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 16.05149; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:34:52,444][trainer][INFO] - Step 16500:
[2023-03-25 15:34:52,444][trainer][INFO] - loss/edit_train: 0.01936; loss/loc_train: 0.01074; edit/acc_train: 1.00000; edit/log_prob_train: -0.01936; edit/prob_train: 0.98093; acc/pre_train: 0.47000; acc/post_train: 0.52000; nll/pre_train: 0.68354; perplexity/pre_train: 1.98088; nll/post_train: 0.67727; perplexity/post_train: 1.96851; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09484; loss/total_train: 0.01267; loss/total_edit_train: 0.01267; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 88.52785; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:35:09,403][trainer][INFO] - Step 16600:
[2023-03-25 15:35:09,403][trainer][INFO] - loss/edit_train: 0.02966; loss/loc_train: 0.00636; edit/acc_train: 1.00000; edit/log_prob_train: -0.02966; edit/prob_train: 0.97237; acc/pre_train: 0.41000; acc/post_train: 0.45000; nll/pre_train: 0.70067; perplexity/pre_train: 2.01509; nll/post_train: 0.72514; perplexity/post_train: 2.06503; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09521; loss/total_train: 0.00933; loss/total_edit_train: 0.00933; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 58.74573; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:35:26,049][trainer][INFO] - Step 16700:
[2023-03-25 15:35:26,050][trainer][INFO] - loss/edit_train: 0.02972; loss/loc_train: 0.00428; edit/acc_train: 1.00000; edit/log_prob_train: -0.02972; edit/prob_train: 0.97197; acc/pre_train: 0.45000; acc/post_train: 0.52000; nll/pre_train: 0.69561; perplexity/pre_train: 2.00493; nll/post_train: 0.72263; perplexity/post_train: 2.05985; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09189; loss/total_train: 0.00725; loss/total_edit_train: 0.00725; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 79.59533; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:35:43,011][trainer][INFO] - Step 16800:
[2023-03-25 15:35:43,011][trainer][INFO] - loss/edit_train: 0.02450; loss/loc_train: 0.00417; edit/acc_train: 1.00000; edit/log_prob_train: -0.02450; edit/prob_train: 0.97663; acc/pre_train: 0.43000; acc/post_train: 0.51000; nll/pre_train: 0.69389; perplexity/pre_train: 2.00149; nll/post_train: 0.69141; perplexity/post_train: 1.99652; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09528; loss/total_train: 0.00662; loss/total_edit_train: 0.00662; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 217.57475; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:35:59,007][trainer][INFO] - Step 16900:
[2023-03-25 15:35:59,007][trainer][INFO] - loss/edit_train: 0.03306; loss/loc_train: 0.00160; edit/acc_train: 1.00000; edit/log_prob_train: -0.03306; edit/prob_train: 0.96993; acc/pre_train: 0.46000; acc/post_train: 0.48000; nll/pre_train: 0.69167; perplexity/pre_train: 1.99704; nll/post_train: 0.69142; perplexity/post_train: 1.99654; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08793; loss/total_train: 0.00491; loss/total_edit_train: 0.00491; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 58.01499; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:36:15,538][trainer][INFO] - Step 17000:
[2023-03-25 15:36:15,539][trainer][INFO] - loss/edit_train: 0.03578; loss/loc_train: 0.00432; edit/acc_train: 1.00000; edit/log_prob_train: -0.03578; edit/prob_train: 0.96749; acc/pre_train: 0.47000; acc/post_train: 0.51000; nll/pre_train: 0.69850; perplexity/pre_train: 2.01074; nll/post_train: 0.68175; perplexity/post_train: 1.97734; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09246; loss/total_train: 0.00790; loss/total_edit_train: 0.00790; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 100.64711; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:36:31,736][trainer][INFO] - Step 17100:
[2023-03-25 15:36:31,736][trainer][INFO] - loss/edit_train: 0.02473; loss/loc_train: 0.00194; edit/acc_train: 1.00000; edit/log_prob_train: -0.02473; edit/prob_train: 0.97710; acc/pre_train: 0.41000; acc/post_train: 0.40000; nll/pre_train: 0.69355; perplexity/pre_train: 2.00081; nll/post_train: 0.71438; perplexity/post_train: 2.04292; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08945; loss/total_train: 0.00441; loss/total_edit_train: 0.00441; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 74.53020; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:36:47,712][trainer][INFO] - Step 17200:
[2023-03-25 15:36:47,713][trainer][INFO] - loss/edit_train: 0.04835; loss/loc_train: 0.00119; edit/acc_train: 0.99000; edit/log_prob_train: -0.04835; edit/prob_train: 0.95930; acc/pre_train: 0.44000; acc/post_train: 0.46000; nll/pre_train: 0.70201; perplexity/pre_train: 2.01780; nll/post_train: 0.69872; perplexity/post_train: 2.01118; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08719; loss/total_train: 0.00602; loss/total_edit_train: 0.00602; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 54.61927; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:37:04,443][trainer][INFO] - Step 17300:
[2023-03-25 15:37:04,444][trainer][INFO] - loss/edit_train: 0.04148; loss/loc_train: 0.00093; edit/acc_train: 0.99000; edit/log_prob_train: -0.04148; edit/prob_train: 0.96472; acc/pre_train: 0.48000; acc/post_train: 0.54000; nll/pre_train: 0.68629; perplexity/pre_train: 1.98633; nll/post_train: 0.68458; perplexity/post_train: 1.98295; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09236; loss/total_train: 0.00508; loss/total_edit_train: 0.00508; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 32.86532; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:37:22,325][trainer][INFO] - Step 17400:
[2023-03-25 15:37:22,325][trainer][INFO] - loss/edit_train: 0.01654; loss/loc_train: 0.00147; edit/acc_train: 1.00000; edit/log_prob_train: -0.01654; edit/prob_train: 0.98360; acc/pre_train: 0.39000; acc/post_train: 0.45000; nll/pre_train: 0.70453; perplexity/pre_train: 2.02289; nll/post_train: 0.71258; perplexity/post_train: 2.03924; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09835; loss/total_train: 0.00312; loss/total_edit_train: 0.00312; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 21.26267; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:37:39,615][trainer][INFO] - Step 17500:
[2023-03-25 15:37:39,616][trainer][INFO] - loss/edit_train: 0.04384; loss/loc_train: 0.00626; edit/acc_train: 0.99000; edit/log_prob_train: -0.04384; edit/prob_train: 0.96254; acc/pre_train: 0.46000; acc/post_train: 0.43000; nll/pre_train: 0.69809; perplexity/pre_train: 2.00991; nll/post_train: 0.72691; perplexity/post_train: 2.06868; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09455; loss/total_train: 0.01064; loss/total_edit_train: 0.01064; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 409.74801; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:37:57,327][trainer][INFO] - Step 17600:
[2023-03-25 15:37:57,328][trainer][INFO] - loss/edit_train: 0.03104; loss/loc_train: 0.00105; edit/acc_train: 1.00000; edit/log_prob_train: -0.03104; edit/prob_train: 0.97241; acc/pre_train: 0.48000; acc/post_train: 0.58000; nll/pre_train: 0.68578; perplexity/pre_train: 1.98531; nll/post_train: 0.67398; perplexity/post_train: 1.96203; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09787; loss/total_train: 0.00415; loss/total_edit_train: 0.00415; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 16.11599; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:38:14,733][trainer][INFO] - Step 17700:
[2023-03-25 15:38:14,734][trainer][INFO] - loss/edit_train: 0.03663; loss/loc_train: 0.00553; edit/acc_train: 0.99000; edit/log_prob_train: -0.03663; edit/prob_train: 0.96778; acc/pre_train: 0.39000; acc/post_train: 0.43000; nll/pre_train: 0.71203; perplexity/pre_train: 2.03813; nll/post_train: 0.74748; perplexity/post_train: 2.11168; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09531; loss/total_train: 0.00919; loss/total_edit_train: 0.00919; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 88.19327; lr/lr0_train: 0.00019; lr/lr1_train: 0.00009; lr/lr2_train: 0.00010; lr/lr3_train: 0.00003; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:38:31,492][trainer][INFO] - Step 17800:
[2023-03-25 15:38:31,493][trainer][INFO] - loss/edit_train: 0.02202; loss/loc_train: 0.00131; edit/acc_train: 1.00000; edit/log_prob_train: -0.02202; edit/prob_train: 0.97864; acc/pre_train: 0.45000; acc/post_train: 0.41000; nll/pre_train: 0.70320; perplexity/pre_train: 2.02021; nll/post_train: 0.70375; perplexity/post_train: 2.02132; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09206; loss/total_train: 0.00351; loss/total_edit_train: 0.00351; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 15.07629; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00003; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:38:49,310][trainer][INFO] - Step 17900:
[2023-03-25 15:38:49,311][trainer][INFO] - loss/edit_train: 0.01831; loss/loc_train: 0.00594; edit/acc_train: 1.00000; edit/log_prob_train: -0.01831; edit/prob_train: 0.98190; acc/pre_train: 0.41000; acc/post_train: 0.51000; nll/pre_train: 0.69884; perplexity/pre_train: 2.01142; nll/post_train: 0.72268; perplexity/post_train: 2.05995; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09698; loss/total_train: 0.00777; loss/total_edit_train: 0.00777; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 58.28037; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00003; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:39:06,815][trainer][INFO] - Step 18000:
[2023-03-25 15:39:06,815][trainer][INFO] - loss/edit_train: 0.02542; loss/loc_train: 0.00136; edit/acc_train: 0.99000; edit/log_prob_train: -0.02542; edit/prob_train: 0.97717; acc/pre_train: 0.43000; acc/post_train: 0.50000; nll/pre_train: 0.71322; perplexity/pre_train: 2.04055; nll/post_train: 0.68825; perplexity/post_train: 1.99023; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09468; loss/total_train: 0.00390; loss/total_edit_train: 0.00390; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 25.41937; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00003; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:39:23,803][trainer][INFO] - Step 18100:
[2023-03-25 15:39:23,803][trainer][INFO] - loss/edit_train: 0.02918; loss/loc_train: 0.00818; edit/acc_train: 0.99000; edit/log_prob_train: -0.02918; edit/prob_train: 0.97372; acc/pre_train: 0.42000; acc/post_train: 0.48000; nll/pre_train: 0.70483; perplexity/pre_train: 2.02350; nll/post_train: 0.71569; perplexity/post_train: 2.04560; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09072; loss/total_train: 0.01110; loss/total_edit_train: 0.01110; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 128.57945; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00003; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:39:40,931][trainer][INFO] - Step 18200:
[2023-03-25 15:39:40,932][trainer][INFO] - loss/edit_train: 0.02637; loss/loc_train: 0.00933; edit/acc_train: 1.00000; edit/log_prob_train: -0.02637; edit/prob_train: 0.97526; acc/pre_train: 0.47000; acc/post_train: 0.53000; nll/pre_train: 0.70017; perplexity/pre_train: 2.01409; nll/post_train: 0.67508; perplexity/post_train: 1.96419; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09303; loss/total_train: 0.01197; loss/total_edit_train: 0.01197; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 332.07697; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00010; lr/lr3_train: 0.00003; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:39:59,080][trainer][INFO] - Step 18300:
[2023-03-25 15:39:59,081][trainer][INFO] - loss/edit_train: 0.03141; loss/loc_train: 0.00493; edit/acc_train: 1.00000; edit/log_prob_train: -0.03141; edit/prob_train: 0.97133; acc/pre_train: 0.43000; acc/post_train: 0.47000; nll/pre_train: 0.68298; perplexity/pre_train: 1.97977; nll/post_train: 0.71251; perplexity/post_train: 2.03911; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09981; loss/total_train: 0.00807; loss/total_edit_train: 0.00807; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 50.75814; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00003; lr/lr4_train: 0.00023; lr/lr5_train: 0.00020
[2023-03-25 15:40:16,584][trainer][INFO] - Step 18400:
[2023-03-25 15:40:16,584][trainer][INFO] - loss/edit_train: 0.02207; loss/loc_train: 0.00516; edit/acc_train: 1.00000; edit/log_prob_train: -0.02207; edit/prob_train: 0.97851; acc/pre_train: 0.48000; acc/post_train: 0.52000; nll/pre_train: 0.67997; perplexity/pre_train: 1.97383; nll/post_train: 0.69092; perplexity/post_train: 1.99555; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09545; loss/total_train: 0.00737; loss/total_edit_train: 0.00737; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 52.11170; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00003; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:40:33,681][trainer][INFO] - Step 18500:
[2023-03-25 15:40:33,682][trainer][INFO] - loss/edit_train: 0.02022; loss/loc_train: 0.00170; edit/acc_train: 1.00000; edit/log_prob_train: -0.02022; edit/prob_train: 0.98037; acc/pre_train: 0.50000; acc/post_train: 0.53000; nll/pre_train: 0.68814; perplexity/pre_train: 1.99001; nll/post_train: 0.67724; perplexity/post_train: 1.96844; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09263; loss/total_train: 0.00373; loss/total_edit_train: 0.00373; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 55.96213; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:40:51,260][trainer][INFO] - Step 18600:
[2023-03-25 15:40:51,261][trainer][INFO] - loss/edit_train: 0.02236; loss/loc_train: 0.00230; edit/acc_train: 1.00000; edit/log_prob_train: -0.02236; edit/prob_train: 0.97876; acc/pre_train: 0.45000; acc/post_train: 0.52000; nll/pre_train: 0.68862; perplexity/pre_train: 1.99096; nll/post_train: 0.69089; perplexity/post_train: 1.99550; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09683; loss/total_train: 0.00453; loss/total_edit_train: 0.00453; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 121.17274; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:41:08,217][trainer][INFO] - Step 18700:
[2023-03-25 15:41:08,217][trainer][INFO] - loss/edit_train: 0.03248; loss/loc_train: 0.00492; edit/acc_train: 1.00000; edit/log_prob_train: -0.03248; edit/prob_train: 0.97112; acc/pre_train: 0.50000; acc/post_train: 0.49000; nll/pre_train: 0.69846; perplexity/pre_train: 2.01065; nll/post_train: 0.73075; perplexity/post_train: 2.07664; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09420; loss/total_train: 0.00817; loss/total_edit_train: 0.00817; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 58.68461; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:41:25,596][trainer][INFO] - Step 18800:
[2023-03-25 15:41:25,596][trainer][INFO] - loss/edit_train: 0.03711; loss/loc_train: 0.00179; edit/acc_train: 1.00000; edit/log_prob_train: -0.03711; edit/prob_train: 0.96693; acc/pre_train: 0.40000; acc/post_train: 0.50000; nll/pre_train: 0.69448; perplexity/pre_train: 2.00266; nll/post_train: 0.68585; perplexity/post_train: 1.98546; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09917; loss/total_train: 0.00550; loss/total_edit_train: 0.00550; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 41.68066; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:41:42,399][trainer][INFO] - Step 18900:
[2023-03-25 15:41:42,399][trainer][INFO] - loss/edit_train: 0.03541; loss/loc_train: 0.00114; edit/acc_train: 0.99000; edit/log_prob_train: -0.03541; edit/prob_train: 0.96920; acc/pre_train: 0.40000; acc/post_train: 0.53000; nll/pre_train: 0.70133; perplexity/pre_train: 2.01644; nll/post_train: 0.69761; perplexity/post_train: 2.00895; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09212; loss/total_train: 0.00468; loss/total_edit_train: 0.00468; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 47.23983; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:41:59,339][trainer][INFO] - Step 19000:
[2023-03-25 15:41:59,340][trainer][INFO] - loss/edit_train: 0.02338; loss/loc_train: 0.00181; edit/acc_train: 1.00000; edit/log_prob_train: -0.02338; edit/prob_train: 0.97800; acc/pre_train: 0.47000; acc/post_train: 0.52000; nll/pre_train: 0.69560; perplexity/pre_train: 2.00491; nll/post_train: 0.68312; perplexity/post_train: 1.98004; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09230; loss/total_train: 0.00415; loss/total_edit_train: 0.00415; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 93.58658; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:42:16,462][trainer][INFO] - Step 19100:
[2023-03-25 15:42:16,463][trainer][INFO] - loss/edit_train: 0.02521; loss/loc_train: 0.00270; edit/acc_train: 1.00000; edit/log_prob_train: -0.02521; edit/prob_train: 0.97671; acc/pre_train: 0.38000; acc/post_train: 0.51000; nll/pre_train: 0.70082; perplexity/pre_train: 2.01541; nll/post_train: 0.69870; perplexity/post_train: 2.01114; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09522; loss/total_train: 0.00522; loss/total_edit_train: 0.00522; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 109.83227; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:42:33,403][trainer][INFO] - Step 19200:
[2023-03-25 15:42:33,404][trainer][INFO] - loss/edit_train: 0.01883; loss/loc_train: 0.01188; edit/acc_train: 1.00000; edit/log_prob_train: -0.01883; edit/prob_train: 0.98153; acc/pre_train: 0.32000; acc/post_train: 0.37000; nll/pre_train: 0.71616; perplexity/pre_train: 2.04656; nll/post_train: 0.76043; perplexity/post_train: 2.13919; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09378; loss/total_train: 0.01376; loss/total_edit_train: 0.01376; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 144.53092; lr/lr0_train: 0.00018; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:42:50,316][trainer][INFO] - Step 19300:
[2023-03-25 15:42:50,316][trainer][INFO] - loss/edit_train: 0.02418; loss/loc_train: 0.00128; edit/acc_train: 1.00000; edit/log_prob_train: -0.02418; edit/prob_train: 0.97712; acc/pre_train: 0.37000; acc/post_train: 0.38000; nll/pre_train: 0.71343; perplexity/pre_train: 2.04099; nll/post_train: 0.71745; perplexity/post_train: 2.04920; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09199; loss/total_train: 0.00369; loss/total_edit_train: 0.00369; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 30.67569; lr/lr0_train: 0.00018; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:43:07,427][trainer][INFO] - Step 19400:
[2023-03-25 15:43:07,428][trainer][INFO] - loss/edit_train: 0.01922; loss/loc_train: 0.00764; edit/acc_train: 1.00000; edit/log_prob_train: -0.01922; edit/prob_train: 0.98109; acc/pre_train: 0.46000; acc/post_train: 0.51000; nll/pre_train: 0.69527; perplexity/pre_train: 2.00425; nll/post_train: 0.73305; perplexity/post_train: 2.08143; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09328; loss/total_train: 0.00956; loss/total_edit_train: 0.00956; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 150.58012; lr/lr0_train: 0.00018; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:43:24,151][trainer][INFO] - Step 19500:
[2023-03-25 15:43:24,151][trainer][INFO] - loss/edit_train: 0.02245; loss/loc_train: 0.00119; edit/acc_train: 1.00000; edit/log_prob_train: -0.02245; edit/prob_train: 0.97896; acc/pre_train: 0.42000; acc/post_train: 0.54000; nll/pre_train: 0.69865; perplexity/pre_train: 2.01103; nll/post_train: 0.68927; perplexity/post_train: 1.99227; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09184; loss/total_train: 0.00343; loss/total_edit_train: 0.00343; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 19.19414; lr/lr0_train: 0.00018; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00019
[2023-03-25 15:43:41,497][trainer][INFO] - Step 19600:
[2023-03-25 15:43:41,498][trainer][INFO] - loss/edit_train: 0.02374; loss/loc_train: 0.00510; edit/acc_train: 1.00000; edit/log_prob_train: -0.02374; edit/prob_train: 0.97793; acc/pre_train: 0.39000; acc/post_train: 0.52000; nll/pre_train: 0.70675; perplexity/pre_train: 2.02738; nll/post_train: 0.73118; perplexity/post_train: 2.07753; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09203; loss/total_train: 0.00747; loss/total_edit_train: 0.00747; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 38.88471; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00019
[2023-03-25 15:43:58,118][trainer][INFO] - Step 19700:
[2023-03-25 15:43:58,119][trainer][INFO] - loss/edit_train: 0.01647; loss/loc_train: 0.00229; edit/acc_train: 1.00000; edit/log_prob_train: -0.01647; edit/prob_train: 0.98368; acc/pre_train: 0.45000; acc/post_train: 0.44000; nll/pre_train: 0.70649; perplexity/pre_train: 2.02686; nll/post_train: 0.69590; perplexity/post_train: 2.00551; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09154; loss/total_train: 0.00394; loss/total_edit_train: 0.00394; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 73.89807; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00019
[2023-03-25 15:44:14,658][trainer][INFO] - Step 19800:
[2023-03-25 15:44:14,659][trainer][INFO] - loss/edit_train: 0.02153; loss/loc_train: 0.00759; edit/acc_train: 1.00000; edit/log_prob_train: -0.02153; edit/prob_train: 0.97888; acc/pre_train: 0.48000; acc/post_train: 0.55000; nll/pre_train: 0.69407; perplexity/pre_train: 2.00185; nll/post_train: 0.73518; perplexity/post_train: 2.08587; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09098; loss/total_train: 0.00974; loss/total_edit_train: 0.00974; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 116.77741; lr/lr0_train: 0.00018; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00019
[2023-03-25 15:44:31,488][trainer][INFO] - Step 19900:
[2023-03-25 15:44:31,488][trainer][INFO] - loss/edit_train: 0.02999; loss/loc_train: 0.00599; edit/acc_train: 1.00000; edit/log_prob_train: -0.02999; edit/prob_train: 0.97160; acc/pre_train: 0.39000; acc/post_train: 0.51000; nll/pre_train: 0.69365; perplexity/pre_train: 2.00100; nll/post_train: 0.72901; perplexity/post_train: 2.07303; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09229; loss/total_train: 0.00899; loss/total_edit_train: 0.00899; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 75.41480; lr/lr0_train: 0.00018; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00019
[2023-03-25 15:44:48,604][trainer][INFO] - Step 20000:
[2023-03-25 15:44:48,605][trainer][INFO] - loss/edit_train: 0.02836; loss/loc_train: 0.00109; edit/acc_train: 1.00000; edit/log_prob_train: -0.02836; edit/prob_train: 0.97289; acc/pre_train: 0.40000; acc/post_train: 0.45000; nll/pre_train: 0.70317; perplexity/pre_train: 2.02014; nll/post_train: 0.70677; perplexity/post_train: 2.02743; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09267; loss/total_train: 0.00392; loss/total_edit_train: 0.00392; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 33.97721; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00023; lr/lr5_train: 0.00019
[2023-03-25 15:45:50,652][trainer][INFO] - Step 20000:
[2023-03-25 15:45:50,652][trainer][INFO] - loss/edit_val: 0.02422; loss/loc_val: 0.00463; edit/acc_val: 1.00000; edit/log_prob_val: -0.02422; edit/prob_val: 0.97737; acc/pre_val: 0.47600; acc/post_val: 0.48500; nll/pre_val: 0.69428; perplexity/pre_val: 2.00227; nll/post_val: 0.70791; perplexity/post_val: 2.02974; n_tokens/pre_val: 4.00000; n_tokens/post_val: 4.00000; time/edit_val: 0.08234; loss/total_val: 0.00705; loss/total_edit_val: 0.00705; memory/alloc_max_val: 3409559040.00000; memory/res_max_val: 3818913792.00000; eval_time/elapsed: 62.01880; eval_time/average: 0.12404
[2023-03-25 15:45:50,656][trainer][INFO] - Saving model to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-47-21_2742039763/models/bert-base-multilingual-uncased.2023-03-25_14-47-21_2742039763
[2023-03-25 15:45:50,656][trainer][INFO] - Moving old archive to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-47-21_2742039763/models/bert-base-multilingual-uncased.2023-03-25_14-47-21_2742039763.bk
[2023-03-25 15:45:51,751][trainer][INFO] - Write complete.
[2023-03-25 15:46:08,240][trainer][INFO] - Step 20100:
[2023-03-25 15:46:08,241][trainer][INFO] - loss/edit_train: 0.01841; loss/loc_train: 0.00494; edit/acc_train: 1.00000; edit/log_prob_train: -0.01841; edit/prob_train: 0.98181; acc/pre_train: 0.45000; acc/post_train: 0.56000; nll/pre_train: 0.69712; perplexity/pre_train: 2.00796; nll/post_train: 0.72233; perplexity/post_train: 2.05923; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08884; loss/total_train: 0.00678; loss/total_edit_train: 0.00678; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 60.01323; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00019
[2023-03-25 15:46:24,937][trainer][INFO] - Step 20200:
[2023-03-25 15:46:24,937][trainer][INFO] - loss/edit_train: 0.02009; loss/loc_train: 0.00639; edit/acc_train: 1.00000; edit/log_prob_train: -0.02009; edit/prob_train: 0.98038; acc/pre_train: 0.48000; acc/post_train: 0.47000; nll/pre_train: 0.69964; perplexity/pre_train: 2.01303; nll/post_train: 0.73667; perplexity/post_train: 2.08897; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09097; loss/total_train: 0.00840; loss/total_edit_train: 0.00840; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 88.91865; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:46:41,898][trainer][INFO] - Step 20300:
[2023-03-25 15:46:41,899][trainer][INFO] - loss/edit_train: 0.02367; loss/loc_train: 0.00474; edit/acc_train: 1.00000; edit/log_prob_train: -0.02367; edit/prob_train: 0.97807; acc/pre_train: 0.39000; acc/post_train: 0.46000; nll/pre_train: 0.69884; perplexity/pre_train: 2.01143; nll/post_train: 0.69200; perplexity/post_train: 1.99771; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09374; loss/total_train: 0.00711; loss/total_edit_train: 0.00711; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 89.20937; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:46:58,344][trainer][INFO] - Step 20400:
[2023-03-25 15:46:58,344][trainer][INFO] - loss/edit_train: 0.02759; loss/loc_train: 0.00088; edit/acc_train: 1.00000; edit/log_prob_train: -0.02759; edit/prob_train: 0.97438; acc/pre_train: 0.42000; acc/post_train: 0.54000; nll/pre_train: 0.70229; perplexity/pre_train: 2.01838; nll/post_train: 0.68989; perplexity/post_train: 1.99350; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08989; loss/total_train: 0.00364; loss/total_edit_train: 0.00364; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 33.20675; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:47:15,414][trainer][INFO] - Step 20500:
[2023-03-25 15:47:15,414][trainer][INFO] - loss/edit_train: 0.01609; loss/loc_train: 0.00537; edit/acc_train: 1.00000; edit/log_prob_train: -0.01609; edit/prob_train: 0.98407; acc/pre_train: 0.40000; acc/post_train: 0.49000; nll/pre_train: 0.69285; perplexity/pre_train: 1.99941; nll/post_train: 0.71528; perplexity/post_train: 2.04475; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09419; loss/total_train: 0.00698; loss/total_edit_train: 0.00698; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 92.59606; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00020
[2023-03-25 15:47:31,733][trainer][INFO] - Step 20600:
[2023-03-25 15:47:31,733][trainer][INFO] - loss/edit_train: 0.02880; loss/loc_train: 0.01321; edit/acc_train: 1.00000; edit/log_prob_train: -0.02880; edit/prob_train: 0.97488; acc/pre_train: 0.42000; acc/post_train: 0.53000; nll/pre_train: 0.71463; perplexity/pre_train: 2.04343; nll/post_train: 0.74782; perplexity/post_train: 2.11238; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09383; loss/total_train: 0.01609; loss/total_edit_train: 0.01609; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 249.18674; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00019
[2023-03-25 15:47:47,703][trainer][INFO] - Step 20700:
[2023-03-25 15:47:47,703][trainer][INFO] - loss/edit_train: 0.04513; loss/loc_train: 0.00055; edit/acc_train: 0.98000; edit/log_prob_train: -0.04513; edit/prob_train: 0.96115; acc/pre_train: 0.53000; acc/post_train: 0.56000; nll/pre_train: 0.68445; perplexity/pre_train: 1.98269; nll/post_train: 0.67995; perplexity/post_train: 1.97377; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08587; loss/total_train: 0.00507; loss/total_edit_train: 0.00507; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 61.20170; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00019
[2023-03-25 15:48:03,948][trainer][INFO] - Step 20800:
[2023-03-25 15:48:03,949][trainer][INFO] - loss/edit_train: 0.02633; loss/loc_train: 0.00610; edit/acc_train: 0.99000; edit/log_prob_train: -0.02633; edit/prob_train: 0.97606; acc/pre_train: 0.48000; acc/post_train: 0.49000; nll/pre_train: 0.70716; perplexity/pre_train: 2.02822; nll/post_train: 0.69063; perplexity/post_train: 1.99497; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09207; loss/total_train: 0.00873; loss/total_edit_train: 0.00873; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 38.00766; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00019
[2023-03-25 15:48:19,898][trainer][INFO] - Step 20900:
[2023-03-25 15:48:19,899][trainer][INFO] - loss/edit_train: 0.01734; loss/loc_train: 0.00457; edit/acc_train: 1.00000; edit/log_prob_train: -0.01734; edit/prob_train: 0.98289; acc/pre_train: 0.37000; acc/post_train: 0.39000; nll/pre_train: 0.70156; perplexity/pre_train: 2.01690; nll/post_train: 0.68602; perplexity/post_train: 1.98580; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08992; loss/total_train: 0.00631; loss/total_edit_train: 0.00631; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 136.55847; lr/lr0_train: 0.00019; lr/lr1_train: 0.00008; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00019
[2023-03-25 15:48:36,569][trainer][INFO] - Step 21000:
[2023-03-25 15:48:36,569][trainer][INFO] - loss/edit_train: 0.02032; loss/loc_train: 0.00124; edit/acc_train: 1.00000; edit/log_prob_train: -0.02032; edit/prob_train: 0.98073; acc/pre_train: 0.38000; acc/post_train: 0.41000; nll/pre_train: 0.71814; perplexity/pre_train: 2.05062; nll/post_train: 0.69591; perplexity/post_train: 2.00553; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09057; loss/total_train: 0.00327; loss/total_edit_train: 0.00327; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 121.84148; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00019
[2023-03-25 15:48:52,849][trainer][INFO] - Step 21100:
[2023-03-25 15:48:52,849][trainer][INFO] - loss/edit_train: 0.02830; loss/loc_train: 0.00100; edit/acc_train: 1.00000; edit/log_prob_train: -0.02830; edit/prob_train: 0.97451; acc/pre_train: 0.48000; acc/post_train: 0.53000; nll/pre_train: 0.69255; perplexity/pre_train: 1.99881; nll/post_train: 0.69121; perplexity/post_train: 1.99614; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08764; loss/total_train: 0.00383; loss/total_edit_train: 0.00383; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 24.41252; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00019
[2023-03-25 15:49:09,038][trainer][INFO] - Step 21200:
[2023-03-25 15:49:09,039][trainer][INFO] - loss/edit_train: 0.01567; loss/loc_train: 0.00144; edit/acc_train: 1.00000; edit/log_prob_train: -0.01567; edit/prob_train: 0.98447; acc/pre_train: 0.43000; acc/post_train: 0.49000; nll/pre_train: 0.69917; perplexity/pre_train: 2.01207; nll/post_train: 0.70004; perplexity/post_train: 2.01384; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08861; loss/total_train: 0.00301; loss/total_edit_train: 0.00301; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 60.85003; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00019
[2023-03-25 15:49:25,645][trainer][INFO] - Step 21300:
[2023-03-25 15:49:25,645][trainer][INFO] - loss/edit_train: 0.01942; loss/loc_train: 0.00154; edit/acc_train: 1.00000; edit/log_prob_train: -0.01942; edit/prob_train: 0.98144; acc/pre_train: 0.44000; acc/post_train: 0.47000; nll/pre_train: 0.69455; perplexity/pre_train: 2.00280; nll/post_train: 0.69294; perplexity/post_train: 1.99959; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09180; loss/total_train: 0.00348; loss/total_edit_train: 0.00348; memory/alloc_max_train: 3409559040.00000; memory/res_max_train: 3818913792.00000; grad_train: 64.22324; lr/lr0_train: 0.00019; lr/lr1_train: 0.00007; lr/lr2_train: 0.00011; lr/lr3_train: 0.00002; lr/lr4_train: 0.00024; lr/lr5_train: 0.00019