
Commit 2e65c81: force not to decode unk token
1 parent 5748520 · commit 2e65c81

10 files changed, +161 -6095 lines
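The heart of the change: at decoding time, the UNK token is suppressed by setting its score to negative infinity before the next token is selected, so it can never win. A minimal sketch of the mechanism outside joeynmt's classes (the shapes and the unk_index value are illustrative assumptions):

import torch

logits = torch.randn(2, 6)  # toy batch of 2, vocab of 6
unk_index = 1               # illustrative; joeynmt reads it off the vocabulary

generate_unk = False
if not generate_unk:
    # -inf becomes probability zero under (log_)softmax, so neither
    # greedy argmax nor beam-search topk can ever pick <unk>
    logits[:, unk_index] = float("-inf")

next_word = torch.argmax(logits, dim=-1)
assert (next_word != unk_index).all()

The same masking is applied in three places below: recurrent greedy, transformer greedy, and beam search.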

joeynmt/helpers_for_audio.py

Lines changed: 5 additions & 14 deletions
@@ -4,7 +4,6 @@
 """
 
 import io
-import os
 from pathlib import Path
 import sys
 from typing import List, Optional, Tuple, Union
@@ -24,7 +23,7 @@
                if unicodedata.category(chr(i)).startswith('P')}
 def remove_punc(sent: str) -> str:
     """Remove punctuation based on Unicode category.
-    Note: punctuations in audio transcription are often removed.
+    Note: punctuations in audio transcription are sometimes removed.
 
     :param sent: sentence string
     """
@@ -36,7 +35,7 @@ def __init__(self, fbank_path: str, n_frames: int, idx: Union[int, str]):
         """Speech Instance
 
         :param fbank_path: (str) Feature file path in the format either of
-            "<zip path>:<byte offset>:<byte length>" or "<file name>.mp3"
+            "<zip path>:<byte offset>:<byte length>" or "<file name>.{mp3|wav}"
         :param n_frames: (int) number of frames
         :param idx: index
         """
@@ -69,17 +68,11 @@ def _get_torchaudio_fbank(waveform: torch.FloatTensor, sample_rate: int,
 # from fairseq
 def extract_fbank_features(waveform: torch.FloatTensor,
                            sample_rate: int,
-                           n_frames: int,
-                           utt_id: str,
-                           feature_root: Optional[Path] = None,
+                           output_path: Optional[Path] = None,
                            n_mel_bins: int = 80,
                            overwrite: bool = False) -> Optional[np.ndarray]:
     # pylint: disable=inconsistent-return-statements
 
-    output_path = None
-    if feature_root is not None:
-        output_path = feature_root / f"{utt_id}.npy"
-
     if output_path is not None and output_path.is_file() and not overwrite:
         return
 
@@ -88,10 +81,9 @@ def extract_fbank_features(waveform: torch.FloatTensor,
 
     try:
         features = _get_torchaudio_fbank(_waveform, sample_rate, n_mel_bins)
-        assert abs(features.shape[0] - n_frames) <= 1, (n_frames, features.shape)
     except Exception as e:
         raise ValueError(f"torchaudio failed to extract mel filterbank features "
-                         f"at utt_id: {utt_id}. {e}")
+                         f"at: {output_path.stem}. {e}")
 
     if output_path is not None:
         np.save(output_path.as_posix(), features)
@@ -137,8 +129,7 @@ def get_features(root_path: Path, fbank_path: str) -> np.ndarray:
             features = np.load(_path.as_posix())
         elif _path.suffix in [".mp3", ".wav"]:
             waveform, sample_rate = torchaudio.load(_path.as_posix())
-            num_frames = get_n_frames(waveform.size(1), sample_rate)
-            features = extract_fbank_features(waveform, sample_rate, num_frames)
+            features = extract_fbank_features(waveform, sample_rate)
         else:
             raise ValueError(f"Invalid file type: {_path}")
     elif len(extra) == 2:
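For reference, the _get_torchaudio_fbank helper that extract_fbank_features wraps follows the fairseq recipe around torchaudio.compliance.kaldi.fbank. A hedged sketch of that call (the int16 scaling mirrors fairseq's convention; the file name is hypothetical, and the exact preprocessing is an assumption, not read from this diff):

import torchaudio
from torchaudio.compliance.kaldi import fbank

waveform, sample_rate = torchaudio.load("utt0001.wav")  # hypothetical file
_waveform = waveform * (2 ** 15)  # kaldi-style fbank expects int16-range values
features = fbank(_waveform, num_mel_bins=80,
                 sample_frequency=sample_rate)  # tensor of shape (n_frames, 80)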

joeynmt/model.py

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ def __init__(self,
         self.pad_index = self.trg_vocab.pad_index
         self.bos_index = self.trg_vocab.bos_index
         self.eos_index = self.trg_vocab.eos_index
+        self.unk_index = self.trg_vocab.unk_index
         self._loss_function = None  # set by the TrainManager
 
     @property

joeynmt/prediction.py

Lines changed: 7 additions & 5 deletions
@@ -119,8 +119,8 @@ def validate_on_data(model: Model,
                     return_type="loss", **vars(batch))
                 if n_gpu > 1:
                     batch_loss = batch_loss.sum()  # sum on multi-gpu
-                    nll_loss = nll_loss.sum()
-                    ctc_loss = ctc_loss.sum()
+                    nll_loss = nll_loss.sum() if torch.is_tensor(nll_loss) else None
+                    ctc_loss = ctc_loss.sum() if torch.is_tensor(ctc_loss) else None
                     n_correct = n_correct.float().sum()
                 total_loss['loss'] += batch_loss.item()  # float
                 if torch.is_tensor(nll_loss):  # nll_loss is not None
@@ -135,7 +135,7 @@ def validate_on_data(model: Model,
             output, attention_scores = run_batch(
                 model=model, batch=batch, beam_size=beam_size,
                 beam_alpha=beam_alpha, max_output_length=max_output_length,
-                n_best=n_best)
+                n_best=n_best, generate_unk=False)
 
             # sort outputs back to original order
             all_outputs.extend(output[sort_reverse_index])
@@ -146,8 +146,10 @@ def validate_on_data(model: Model,
     if compute_loss and total_ntokens > 0:
         total_normalizer = 1 if total_normalizer == 0 else total_normalizer
         valid_scores['loss'] = total_loss['loss'] / total_normalizer
-        valid_scores['nll_loss'] = total_loss['nll_loss'] / total_normalizer
-        valid_scores['ctc_loss'] = total_loss['ctc_loss'] / total_normalizer
+        if 'nll_loss' in total_loss:
+            valid_scores['nll_loss'] = total_loss['nll_loss'] / total_normalizer
+        if 'ctc_loss' in total_loss:
+            valid_scores['ctc_loss'] = total_loss['ctc_loss'] / total_normalizer
         # accuracy before decoding
         valid_scores['acc'] = total_n_correct / total_ntokens
         # exponent of token-level negative log prob
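The validation changes make both the multi-GPU reduction and the final normalization tolerate an absent loss term (e.g. a model trained without the CTC branch). A small sketch of the pattern with stand-in values (none of these numbers come from the commit):

import torch

nll_loss, ctc_loss = torch.tensor([1.2, 0.8]), None  # ctc branch absent here

# reduce only what is actually a tensor
nll_loss = nll_loss.sum() if torch.is_tensor(nll_loss) else None
ctc_loss = ctc_loss.sum() if torch.is_tensor(ctc_loss) else None

total_loss = {'loss': 4.0}
if torch.is_tensor(nll_loss):
    total_loss['nll_loss'] = total_loss.get('nll_loss', 0.0) + nll_loss.item()

# normalize only the keys that were ever accumulated
total_normalizer = 100
valid_scores = {k: v / total_normalizer for k, v in total_loss.items()}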

joeynmt/search.py

Lines changed: 33 additions & 9 deletions
@@ -19,8 +19,8 @@
 
 
 def greedy(src_mask: Tensor, max_output_length: int, model: Model,
-           encoder_output: Tensor, encoder_hidden: Tensor) \
-        -> Tuple[np.array, np.array]:
+           encoder_output: Tensor, encoder_hidden: Tensor,
+           generate_unk: bool = False) -> Tuple[np.array, np.array]:
     """
     Greedy decoding. Select the token with the highest probability at each time
     step. This function is a wrapper that calls recurrent_greedy for
@@ -31,7 +31,11 @@ def greedy(src_mask: Tensor, max_output_length: int, model: Model,
     :param model: model to use for greedy decoding
     :param encoder_output: encoder hidden states for attention
     :param encoder_hidden: encoder last state for decoder initialization
+    :param generate_unk: whether to generate the UNK token; if false,
+        the probability of the UNK token will artificially be set to zero
     :return:
+        - stacked_output: output hypotheses (2d array of indices),
+        - stacked_attention_scores: attention scores (3d array)
     """
     # pylint: disable=no-else-return
     if isinstance(model.decoder, TransformerDecoder):
@@ -47,7 +51,8 @@ def greedy(src_mask: Tensor, max_output_length: int, model: Model,
 
 
 def recurrent_greedy(src_mask: Tensor, max_output_length: int, model: Model,
-                     encoder_output: Tensor, encoder_hidden: Tensor) \
+                     encoder_output: Tensor, encoder_hidden: Tensor,
+                     generate_unk: bool = False) \
         -> Tuple[np.ndarray, Optional[np.ndarray]]:
     """
     Greedy decoding: in each step, choose the word that gets highest score.
@@ -58,12 +63,15 @@ def recurrent_greedy(src_mask: Tensor, max_output_length: int, model: Model,
     :param model: model to use for greedy decoding
     :param encoder_output: encoder hidden states for attention
     :param encoder_hidden: encoder last state for decoder initialization
+    :param generate_unk: whether to generate the UNK token; if false,
+        the probability of the UNK token will artificially be set to zero
     :return:
         - stacked_output: output hypotheses (2d array of indices),
        - stacked_attention_scores: attention scores (3d array)
     """
     bos_index = model.bos_index
     eos_index = model.eos_index
+    unk_index = model.unk_index
     batch_size = src_mask.size(0)
     prev_y = src_mask.new_full(size=[batch_size, 1], fill_value=bos_index,
                                dtype=torch.long)
@@ -88,6 +96,8 @@ def recurrent_greedy(src_mask: Tensor, max_output_length: int, model: Model,
         # logits: batch x time=1 x vocab (logits)
 
         # greedy decoding: choose arg max over vocabulary in each step
+        if not generate_unk:
+            logits[:, :, unk_index] = float("-inf")
         next_word = torch.argmax(logits, dim=-1)  # batch x time=1
         output.append(next_word.squeeze(1).detach().cpu().numpy())
         prev_y = next_word
@@ -107,7 +117,8 @@ def recurrent_greedy(src_mask: Tensor, max_output_length: int, model: Model,
 
 
 def transformer_greedy(src_mask: Tensor, max_output_length: int, model: Model,
-                       encoder_output: Tensor, encoder_hidden: Tensor) \
+                       encoder_output: Tensor, encoder_hidden: Tensor,
+                       generate_unk: bool = False) \
         -> Tuple[np.ndarray, Optional[np.ndarray]]:
     """
     Special greedy function for transformer, since it works differently.
@@ -118,13 +129,16 @@ def transformer_greedy(src_mask: Tensor, max_output_length: int, model: Model,
     :param model: model to use for greedy decoding
     :param encoder_output: encoder hidden states for attention
     :param encoder_hidden: encoder final state (unused in Transformer)
+    :param generate_unk: whether to generate the UNK token; if false,
+        the probability of the UNK token will artificially be set to zero
     :return:
         - stacked_output: output hypotheses (2d array of indices),
         - stacked_attention_scores: attention scores (3d array)
     """
     # pylint: disable=unused-argument
     bos_index = model.bos_index
     eos_index = model.eos_index
+    unk_index = model.unk_index
     batch_size = src_mask.size(0)
 
     # start with BOS-symbol for each sentence in the batch
@@ -152,6 +166,8 @@ def transformer_greedy(src_mask: Tensor, max_output_length: int, model: Model,
             trg_mask=trg_mask
         )
         logits = nll_logits[:, -1]
+        if not generate_unk:
+            logits[:, unk_index] = float("-inf")
         _, next_word = torch.max(logits, dim=1)
         next_word = next_word.data
         ys = torch.cat([ys, next_word.unsqueeze(-1)], dim=1)
@@ -169,8 +185,8 @@ def transformer_greedy(src_mask: Tensor, max_output_length: int, model: Model,
 
 def beam_search(model: Model, size: int, encoder_output: Tensor,
                 encoder_hidden: Tensor, src_mask: Tensor,
-                max_output_length: int, alpha: float, n_best: int = 1) \
-        -> Tuple[np.ndarray, Optional[np.ndarray]]:
+                max_output_length: int, alpha: float, n_best: int = 1,
+                generate_unk: bool = False) -> Tuple[np.ndarray, Optional[np.ndarray]]:
     """
     Beam search with size k.
     Inspired by OpenNMT-py, adapted for Transformer.
@@ -183,6 +199,8 @@ def beam_search(model: Model, size: int, encoder_output: Tensor,
     :param max_output_length:
     :param alpha: `alpha` factor for length penalty
     :param n_best: return this many hypotheses, <= beam (currently only 1)
+    :param generate_unk: whether to generate the UNK token; if false,
+        the probability of the UNK token will artificially be set to zero
     :return:
         - stacked_output: output hypotheses (2d array of indices),
         - stacked_attention_scores: attention scores (3d array)
@@ -195,6 +213,7 @@ def beam_search(model: Model, size: int, encoder_output: Tensor,
     bos_index = model.bos_index
     eos_index = model.eos_index
     pad_index = model.pad_index
+    unk_index = model.unk_index
     trg_vocab_size = model.decoder.output_size
     device = encoder_output.device
     transformer = isinstance(model.decoder, TransformerDecoder)
@@ -316,6 +335,8 @@ def beam_search(model: Model, size: int, encoder_output: Tensor,
 
         # batch*k x trg_vocab
         log_probs = F.log_softmax(logits, dim=-1).squeeze(1)
+        if not generate_unk:
+            log_probs[:, unk_index] = float("-inf")
 
         # multiply probs by the beam probability (=add logprobs)
         log_probs += topk_log_probs.view(-1).unsqueeze(1)
@@ -439,7 +460,8 @@ def pad_and_stack_hyps(hyps, pad_value):
 
 
 def run_batch(model: Model, batch: Batch, max_output_length: int,
-              beam_size: int, beam_alpha: float, n_best: int = 1) \
+              beam_size: int, beam_alpha: float, n_best: int = 1,
+              generate_unk: bool = False) \
         -> Tuple[np.ndarray, Optional[np.ndarray]]:
     """
     Get outputs and attentions scores for a given batch
@@ -475,7 +497,8 @@ def run_batch(model: Model, batch: Batch, max_output_length: int,
             max_output_length=max_output_length,
             model=model,
             encoder_output=encoder_output,
-            encoder_hidden=encoder_hidden)
+            encoder_hidden=encoder_hidden,
+            generate_unk=generate_unk)
         # batch, time, max_src_length
     else:  # beam search
         stacked_output, stacked_attention_scores = beam_search(
@@ -486,6 +509,7 @@ def run_batch(model: Model, batch: Batch, max_output_length: int,
             src_mask=src_mask,
             max_output_length=max_output_length,
             alpha=beam_alpha,
-            n_best=n_best)
+            n_best=n_best,
+            generate_unk=generate_unk)
 
     return stacked_output, stacked_attention_scores
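Note that beam_search masks in log-probability space rather than on raw logits; since adding the accumulated beam scores leaves -inf at -inf, <unk> can never enter the top-k. A toy illustration of that step (beam size and vocab size are invented):

import torch
import torch.nn.functional as F

k, vocab_size, unk_index = 2, 5, 1  # invented sizes; index 1 plays <unk>
log_probs = F.log_softmax(torch.randn(k, vocab_size), dim=-1)
log_probs[:, unk_index] = float("-inf")  # mask before beam accumulation

topk_log_probs = torch.zeros(k, 1)  # running beam scores
scores = (log_probs + topk_log_probs).view(-1)  # -inf + finite = -inf
best = scores.topk(k)
assert all(i.item() % vocab_size != unk_index for i in best.indices)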

joeynmt/training.py

Lines changed: 7 additions & 1 deletion
@@ -819,10 +819,16 @@ def train(cfg_file: str, skip_test: bool = False) -> None:
     src_vocab, trg_vocab, train_data, dev_data, test_data = load_data(
         data_cfg=cfg["data"])
 
-    # store the vocabs
+    # store the vocabs and tokenizers
     if task == "MT":
         src_vocab.to_file(model_dir / "src_vocab.txt")
+        if "model_file" in cfg["data"]["src"]["spm"]:
+            src_tok = Path(cfg["data"]["src"]["spm"]["model_file"])
+            shutil.copy2(src_tok, (model_dir / src_tok.name).as_posix())
     trg_vocab.to_file(model_dir / "trg_vocab.txt")
+    if "model_file" in cfg["data"]["trg"]["spm"]:
+        trg_tok = Path(cfg["data"]["trg"]["spm"]["model_file"])
+        shutil.copy2(trg_tok, (model_dir / trg_tok.name).as_posix())
 
     # build an encoder-decoder model
     model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
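The copy step assumes each side of the data config nests a SentencePiece model under spm.model_file. A hedged sketch of the layout those lookups expect and the resulting copy (all paths are placeholders, not from the commit):

from pathlib import Path
import shutil

cfg = {"data": {"src": {"spm": {"model_file": "data/spm/src.model"}},
                "trg": {"spm": {"model_file": "data/spm/trg.model"}}}}
model_dir = Path("models/my_run")  # hypothetical run directory

for side in ("src", "trg"):
    spm_cfg = cfg["data"][side].get("spm", {})
    if "model_file" in spm_cfg:
        tok = Path(spm_cfg["model_file"])
        if tok.is_file():
            # keep the tokenizer next to the vocab so the run dir is self-contained
            shutil.copy2(tok, (model_dir / tok.name).as_posix())

Using .get("spm", {}) here also keeps the step from raising a KeyError when a config has no spm block at all; the committed code indexes the key directly.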

joeynmt/vocabulary.py

Lines changed: 2 additions & 0 deletions
@@ -44,9 +44,11 @@ def __init__(self, tokens: List[str]) -> None:
         self.pad_index = self.lookup(PAD_TOKEN)
         self.bos_index = self.lookup(BOS_TOKEN)
         self.eos_index = self.lookup(EOS_TOKEN)
+        self.unk_index = self.lookup(UNK_TOKEN)
         assert self.pad_index == PAD_ID
         assert self.bos_index == BOS_ID
         assert self.eos_index == EOS_ID
+        assert self.unk_index == UNK_ID
         assert self._itos[UNK_ID] == UNK_TOKEN
 
     def __str__(self) -> str:
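The new assertions pin <unk> to a fixed slot alongside the other special tokens, which is what lets the decoder trust model.unk_index. A toy sketch of that contract (the token strings and ID ordering are assumptions mirroring common joeynmt conventions, not read from this diff):

UNK_TOKEN, PAD_TOKEN, BOS_TOKEN, EOS_TOKEN = "<unk>", "<pad>", "<s>", "</s>"
UNK_ID, PAD_ID, BOS_ID, EOS_ID = 0, 1, 2, 3  # assumed fixed ordering

itos = [UNK_TOKEN, PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, "hello", "world"]
stoi = {tok: i for i, tok in enumerate(itos)}

def lookup(token: str) -> int:
    return stoi.get(token, UNK_ID)  # unknown strings fall back to <unk>

assert lookup(UNK_TOKEN) == UNK_ID
assert lookup("never-seen") == UNK_ID  # why a stable unk_index matters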

scripts/audiodata_utils.py

Lines changed: 2 additions & 9 deletions
@@ -27,27 +27,20 @@ def get_zip_manifest(zip_path: Path, npy_root: Optional[Path] = None):
     manifest = {}
     with zipfile.ZipFile(zip_path, mode="r") as f:
         info = f.infolist()
-    error_flag = []
+    # retrieve offsets
     for i in tqdm(info):
         utt_id = Path(i.filename).stem
         offset, file_size = i.header_offset + 30 + len(i.filename), i.file_size
         with zip_path.open("rb") as f:
             f.seek(offset)
             data = f.read(file_size)
-        try:
-            assert len(data) > 1 and _is_npy_data(data), (utt_id, len(data), e)
-        except Exception as e:
-            print((utt_id, len(data), e))
-            error_flag.append((utt_id, len(data)))
+        assert len(data) > 1 and _is_npy_data(data), (utt_id, len(data))
         manifest[utt_id] = f"{zip_path.name}:{offset}:{file_size}"
         # sanity check
         if npy_root is not None:
             byte_data = np.load(io.BytesIO(data))
             npy_data = np.load((npy_root / f"{utt_id}.npy").as_posix())
             assert np.allclose(byte_data, npy_data)
-    if len(error_flag) > 0:
-        print(error_flag)
-        raise Exception
     return manifest
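The manifest entries ("<zip name>:<offset>:<length>") let a reader pull one utterance's .npy bytes straight out of the zip; the header_offset + 30 + len(filename) arithmetic assumes a stored (uncompressed) entry with an empty extra field. A sketch of the consuming side (file name and numbers are hypothetical):

import io
import numpy as np

def load_from_manifest(entry: str) -> np.ndarray:
    zip_name, offset, length = entry.rsplit(":", 2)
    with open(zip_name, "rb") as f:
        f.seek(int(offset))
        data = f.read(int(length))
    # the slice is a complete .npy payload, so np.load parses it directly
    return np.load(io.BytesIO(data))

# features = load_from_manifest("fbank.zip:12345:67890")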
