If the seq2seq doesn't predict any spaces in the MWT, use the original word to avoid it going crazy
stanfordnlp · Sep 12, 2024 · 3211e72 · 3211e72
1 parent 15e1408
commit 3211e72
Showing 1 changed file with 4 additions and 1 deletion.
diff --git a/stanza/models/mwt/trainer.py b/stanza/models/mwt/trainer.py
@@ -114,7 +114,10 @@ def predict(self, batch, unsort=True, never_decode_unk=False, vocab=None):
             # if any tokens are predicted to expand to blank,
             # that is likely an error.  use the original text
             # this originally came up with the Spanish model turning 's' into a blank
-            pred_tokens = [x if x else y for x, y in zip(pred_tokens, orig_text)]
+            # furthermore, if there are no spaces predicted by the seq2seq,
+            # might as well use the original in case the seq2seq went crazy
+            # this particular error came up training a Hebrew MWT
+            pred_tokens = [x if x and ' ' in x else y for x, y in zip(pred_tokens, orig_text)]
         if unsort:
             pred_tokens = utils.unsort(pred_tokens, orig_idx)
         return pred_tokens