fix the CTC zipformer2 training #1713

Open: wants to merge 1 commit into base: master
fix the CTC zipformer2 training
- some cuts had too many supervision tokens for their frame count
- change the filtering rule to `if (T - 2) < len(tokens): return False`
- this prevents inf. from appearing in the CTC loss value
KarelVesely84 committed Aug 12, 2024
commit d400bc5edf3a3510d29497b9a7b6b1d1d8eb730d
egs/librispeech/ASR/zipformer/train.py (6 changes: 4 additions & 2 deletions)
@@ -1300,9 +1300,11 @@ def remove_short_and_long_utt(c: Cut):
         T = ((c.num_frames - 7) // 2 + 1) // 2
         tokens = sp.encode(c.supervisions[0].text, out_type=str)

-        if T < len(tokens):
+        # For CTC `(T - 2) < len(tokens)` is needed. otherwise inf. in loss appears.
+        # For Transducer `T < len(tokens)` was okay.
+        if (T - 2) < len(tokens):
             logging.warning(
-                f"Exclude cut with ID {c.id} from training. "
+                f"Exclude cut with ID {c.id} from training (too many supervision tokens). "
                 f"Number of frames (before subsampling): {c.num_frames}. "
                 f"Number of frames (after subsampling): {T}. "
                 f"Text: {c.supervisions[0].text}. "
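The filtering logic above can be sketched standalone. This is a minimal illustration, not the PR's code: `num_frames_after_subsampling` and `keep_cut` are hypothetical helper names, and the subsampling formula is copied from the hunk (`T = ((num_frames - 7) // 2 + 1) // 2`). CTC requires at least as many encoder frames as target tokens; the PR additionally demands a 2-frame margin, since cuts that only barely satisfied `T >= len(tokens)` were observed to yield an infinite CTC loss.

```python
# Sketch of the cut-filtering rule changed in this PR (hypothetical
# helper names; the subsampling formula is taken from train.py).

def num_frames_after_subsampling(num_frames: int) -> int:
    # The zipformer front-end reduces the frame count roughly 4x.
    return ((num_frames - 7) // 2 + 1) // 2

def keep_cut(num_frames: int, num_tokens: int) -> bool:
    T = num_frames_after_subsampling(num_frames)
    # Old rule (okay for Transducer): drop only when T < num_tokens.
    # New rule (needed for CTC): require a 2-frame margin, otherwise
    # borderline cuts can produce an infinite CTC loss.
    return not ((T - 2) < num_tokens)
```

For example, a cut with 100 input frames gives T = 23 after subsampling: under the old rule a 23-token supervision would have been kept, while the new rule drops it and keeps only cuts with 21 tokens or fewer.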