Skip to content

Commit

Permalink
Optimize validation scoring (#2266)
Browse files Browse the repository at this point in the history
* sources and refs tokens are recovered with vocab.lookup_index
* tests with dynamic scoring and copy are reactivated
  • Loading branch information
l-k-11235 authored Dec 7, 2022
1 parent cadd99c commit 70799ae
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 5,939 deletions.
16 changes: 10 additions & 6 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -159,16 +159,16 @@ jobs:
-accum_count 2 4 8 \
-accum_steps 0 15000 30000 \
-save_model /tmp/onmt.model \
-train_steps 200 \
-report_every 50 \
-train_eval_steps 100 \
-train_steps 20 \
-report_every 5 \
-train_eval_steps 10 \
-train_metrics "BLEU" "TER" \
-tensorboard "true" \
-scoring_debug "true" \
-tensorboard_log_dir /tmp/logs_train_metrics \
-dump_preds /tmp/dump_preds
python onmt/tests/test_events.py --logdir /tmp/logs_train_metrics -tensorboard_checks train_metrics
- name : Test Transformer training and validation with dynamic scoring
- name : Test Transformer training and validation with dynamic scoring and copy
run: |
python3 train.py \
-config data/data.yaml \
Expand All @@ -190,11 +190,15 @@ jobs:
-save_model /tmp/onmt.model \
-train_steps 10 -valid_steps 5 \
-report_every 2 \
-train_eval_steps 8 \
-train_metrics "BLEU" "TER" \
-valid_metrics "BLEU" "TER" \
-tensorboard "true" \
-scoring_debug "true" \
-tensorboard_log_dir /tmp/logs_train_valid_metrics \
-dump_preds /tmp/dump_preds
# python onmt/tests/test_events.py --logdir /tmp/logs_train_valid_metrics -tensorboard_checks train_valid_metrics
-dump_preds /tmp/dump_preds \
-copy_attn
python onmt/tests/test_events.py --logdir /tmp/logs_train_valid_metrics -tensorboard_checks train_valid_metrics
- name: Test LM training
run: |
python train.py \
Expand Down
2,950 changes: 0 additions & 2,950 deletions data/src-val.txt

Large diffs are not rendered by default.

2,950 changes: 0 additions & 2,950 deletions data/tgt-val.txt

Large diffs are not rendered by default.

17 changes: 10 additions & 7 deletions onmt/tests/pull_request_chk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,12 @@ ${PYTHON} onmt/bin/train.py \
-layers 4 \
-word_vec_size 16 \
-hidden_size 16 \
-num_workers 0 -bucket_size 1024 \
-heads 2 \
-transformer_ff 64 \
-num_workers 0 -bucket_size 1024 \
-train_steps 200 \
-report_every 50 \
-train_eval_steps 100 \
-train_steps 20 \
-report_every 5 \
-train_eval_steps 10 \
-train_metrics "BLEU" "TER" \
-tensorboard "true" \
-scoring_debug "true" \
Expand All @@ -200,7 +199,7 @@ ${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_train_metrics -te
echo "Succeeded" | tee -a ${LOG_FILE}
rm -r $TMP_OUT_DIR/logs_train_metrics

echo -n " [+] Testing NMT training w/ dynamic scoring with validation ..."
echo -n " [+] Testing NMT training w/ dynamic scoring with validation and copy ..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
Expand All @@ -215,14 +214,18 @@ ${PYTHON} onmt/bin/train.py \
-num_workers 0 -bucket_size 1024 \
-heads 2 \
-transformer_ff 64 \
-num_workers 0 -bucket_size 1024 \
-bucket_size 1024 \
-train_steps 10 \
-report_every 2 \
-train_eval_steps 8 -valid_steps 5 \
-train_metrics "BLEU" "TER" \
-valid_metrics "BLEU" "TER" \
-tensorboard "true" \
-scoring_debug "true" \
-dump_preds $TMP_OUT_DIR/dump_pred \
-copy_attn \
-tensorboard_log_dir $TMP_OUT_DIR/logs_train_valid_metrics >> ${LOG_FILE} 2>&1
#${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_train_valid_metrics -tensorboard_checks train_valid_metrics
${PYTHON} onmt/tests/test_events.py --logdir $TMP_OUT_DIR/logs_train_valid_metrics -tensorboard_checks train_valid_metrics
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm -r $TMP_OUT_DIR/logs_train_valid_metrics
Expand Down
3 changes: 3 additions & 0 deletions onmt/train_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ def _get_model_opts(opt, checkpoint=None):

def _build_valid_iter(opt, transforms_cls, vocabs):
"""Build iterator used for validation."""
validset_transforms = opt.data.get("valid", {}).get("transforms", None)
if validset_transforms:
opt.transforms = validset_transforms
valid_iter = build_dynamic_dataset_iter(
opt, transforms_cls, vocabs, task=CorpusTask.VALID,
copy=opt.copy_attn)
Expand Down
26 changes: 15 additions & 11 deletions onmt/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
users of this library) for the strategy things we do.
"""

import time
import torch
import traceback
import onmt.utils
Expand Down Expand Up @@ -325,14 +326,16 @@ def validate(self, valid_iter, moving_average=None):
refs = []
with torch.no_grad():
stats = onmt.utils.Statistics()
start = time.time()
for batch in valid_iter:
src = batch['src']
src_len = batch['srclen']
tgt = batch['tgt']
sources_, refs_ = self.scoring_preparator.\
build_sources_and_refs(batch)
sources.append(sources_)
refs += refs_
if self.valid_scorers:
sources_, refs_ = self.scoring_preparator.\
build_sources_and_refs(batch)
sources.append(sources_)
refs += refs_
with torch.cuda.amp.autocast(enabled=self.optim.amp):
# F-prop through the model.
model_out, attns = valid_model(src, tgt, src_len,
Expand All @@ -342,17 +345,22 @@ def validate(self, valid_iter, moving_average=None):
_, batch_stats = self.valid_loss(batch, model_out, attns)

stats.update(batch_stats)
logger.info("""valid stats calculation and batchs detokenization
took: {} s.""".format(time.time() - start))

# Compute validation metrics (at batch.dataset level)
if len(self.valid_scorers) > 0:
computed_metrics = {}
start = time.time()
preds, texts_ref = self.scoring_preparator.translate(
model=self.model,
sources=sources,
refs=refs,
gpu_rank=self.gpu_rank,
step=self.optim.training_step,
mode="valid")
logger.info("""The translation of the valid dataset
took : {} s.""".format(time.time() - start))
for i, metric in enumerate(self.valid_scorers):
logger.info("UPDATING VALIDATION {}".format(metric))
self.valid_scorers[
Expand All @@ -367,16 +375,12 @@ def validate(self, valid_iter, moving_average=None):
metric, self.valid_scorers[metric]["value"])
)
# Compute stats
batch_stats = onmt.utils.Statistics(
batch_stats.loss,
batch_stats.n_batchs,
batch_stats.n_sents,
batch_stats.n_words,
batch_stats.n_correct,
metric_stats = onmt.utils.Statistics(
0, 0, 0, 0, 0,
computed_metrics)

# Update statistics.
stats.update(batch_stats)
stats.update(metric_stats)

if moving_average:
for param_data, param in zip(model_params_data,
Expand Down
43 changes: 28 additions & 15 deletions onmt/utils/scoring_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from onmt.translate import GNMTGlobalScorer, Translator
from onmt.opts import translate_opts
from onmt.constants import DefaultTokens
from onmt.inputters.text_utils import textbatch_to_tensor
from onmt.inputters.text_utils import _addcopykeys, tensorify, text_sort_key
from onmt.inputters.inputter import IterOnDevice


Expand Down Expand Up @@ -78,19 +78,18 @@ def tokenize_batch(self, batch_side, side):
tokenized_sentences (list): List of lists of tokens.
Each list is a tokenized sentence.
"""
# batch_side.shape[0] sentences to rebuild
# batch_side.shape[1] tokens per sentence
vocab = self.vocabs[side]
nb_sentences = batch_side.shape[0]
nb_tokens_per_sentence = batch_side.shape[1]
indices_to_remove = [vocab.lookup_token(token)
for token in [DefaultTokens.PAD,
DefaultTokens.EOS,
DefaultTokens.BOS]]
tokenized_sentences = []
for i in range(batch_side.shape[0]):
tokens = []
for t in range(batch_side.shape[1]):
token = vocab.ids_to_tokens[batch_side[i, t, 0]]
if (token == DefaultTokens.PAD
or token == DefaultTokens.EOS):
break
if token != DefaultTokens.BOS:
tokens.append(token)
for i in range(nb_sentences):
tokens = [vocab.lookup_index(batch_side[i, t, 0])
for t in range(nb_tokens_per_sentence)
if batch_side[i, t, 0] not in indices_to_remove]
tokenized_sentences.append(tokens)
return tokenized_sentences

Expand Down Expand Up @@ -142,15 +141,29 @@ def translate(self, model, sources, refs, gpu_rank, step, mode):
global_scorer=scorer,
out_file=out_file,
report_align=opt.report_align,
report_score=True,
report_score=False,
logger=None)
preds = []
for sources_ in sources:
# for validation we build an infer_iter per batch
# in order to avoid oom issues because there is no
# batching strategy in `textbatch_to_tensor`
infer_iter = textbatch_to_tensor(translator.vocabs,
sources_, is_train=True)
numeric = []
for i, ex in enumerate(sources_):
if isinstance(ex, bytes):
ex = ex.decode("utf-8")
idxs = translator.vocabs['src'](ex)
num_ex = {'src': {'src': " ".join(ex),
'src_ids': idxs},
'srclen': len(ex),
'tgt': None,
'indices': i,
'align': None}
num_ex = _addcopykeys(translator.vocabs["src"], num_ex)
num_ex["src"]["src"] = ex
numeric.append(num_ex)
numeric.sort(key=text_sort_key, reverse=True)
infer_iter = [tensorify(self.vocabs, numeric)]
infer_iter = IterOnDevice(infer_iter, opt.gpu)
_, preds_ = translator._translate(
infer_iter)
Expand Down

0 comments on commit 70799ae

Please sign in to comment.