Add recipe for the yes_no dataset. (#16)
* Add recipe for the yes_no dataset.

* Refactoring: Remove unused code.

* Add Colab notebook for the yesno dataset.

* Add GitHub actions to run yesno.

* Fix a typo.

* Minor fixes.

* Train more epochs for GitHub actions.

* Minor fixes.

* Minor fixes.

* Fix style issues.
csukuangfj authored Aug 23, 2021
1 parent 19c4214 commit 6c2c9b9
Showing 17 changed files with 2,012 additions and 9 deletions.
89 changes: 89 additions & 0 deletions .github/workflows/run-yesno-recipe.yml
@@ -0,0 +1,89 @@
# Copyright 2021 Fangjun Kuang ([email protected])

# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: run-yesno-recipe

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  run-yesno-recipe:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # os: [ubuntu-18.04, macos-10.15]
        # TODO: enable macOS for CPU testing
        os: [ubuntu-18.04]
        python-version: [3.8]
      fail-fast: false

    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install libsndfile and libsox
        if: startsWith(matrix.os, 'ubuntu')
        run: |
          sudo apt update
          sudo apt install -q -y libsndfile1-dev libsndfile1 ffmpeg
          sudo apt install -q -y --fix-missing sox libsox-dev libsox-fmt-all

      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip black flake8
          python3 -m pip install -U pip
          python3 -m pip install k2==1.4.dev20210822+cpu.torch1.7.1 -f https://k2-fsa.org/nightly/
          python3 -m pip install torchaudio==0.7.2
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          # We are in the ./icefall directory, which contains requirements.txt
          python3 -m pip install -r requirements.txt

      - name: Run yesno recipe
        shell: bash
        working-directory: ${{github.workspace}}
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          echo $PYTHONPATH
          ls -lh
          # The following lines are needed only on macOS
          lib_path=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")
          echo "lib_path: $lib_path"
          export DYLD_LIBRARY_PATH=$lib_path:$DYLD_LIBRARY_PATH
          ls -lh $lib_path
          cd egs/yesno/ASR
          ./prepare.sh
          python3 ./tdnn/train.py --num-epochs 100
          python3 ./tdnn/decode.py --epoch 99
          python3 ./tdnn/decode.py --epoch 95
          python3 ./tdnn/decode.py --epoch 90
          python3 ./tdnn/decode.py --epoch 80
          python3 ./tdnn/decode.py --epoch 70
          python3 ./tdnn/decode.py --epoch 60
          # TODO: Check that the WER is less than some value
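The final TODO is left open in this commit. As a hedged sketch of one way such a gate could work (assuming `decode.py` prints a result line of the form `[test_set] %WER 0.42% [...]`, as shown in the README changes below; the 1.0% threshold is a made-up placeholder, not a project value):

```
# Hypothetical WER gate, not part of this commit. Assumes decode.py
# emits a line like "[test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]".
wer=$(python3 ./tdnn/decode.py --epoch 99 2>&1 | grep -oE '%WER [0-9.]+' | awk '{print $2}' | tail -n 1)
echo "WER: ${wer}%"
# Fail the CI job if the WER exceeds the placeholder threshold of 1.0%.
awk -v w="$wer" 'BEGIN { exit !(w <= 1.0) }'
```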
19 changes: 16 additions & 3 deletions README.md
@@ -48,10 +48,22 @@ python3 -c "import icefall; print(icefall.__file__)"

It should print the path to `icefall`.

-## Run recipes
+## Recipes

-At present, only LibriSpeech recipe is provided. Please
-follow [egs/librispeech/ASR/README.md][LibriSpeech] to run it.
+At present, two recipes are provided:
+
+- [LibriSpeech][LibriSpeech]
+- [yesno][yesno] [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing)
+
+### Yesno
+
+For the yesno recipe, training with 50 epochs takes less than 2 minutes using **CPU**.
+
+The WER is
+
+```
+[test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+```

## Use Pre-trained models

@@ -60,6 +72,7 @@ for how to use pre-trained models.
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1huyupXAcHsUrKaWfI83iMEJ6J0Nh0213?usp=sharing)


+[yesno]: egs/yesno/ASR/README.md
[LibriSpeech]: egs/librispeech/ASR/README.md
[k2-install]: https://k2.readthedocs.io/en/latest/installation/index.html#
[k2]: https://github.com/k2-fsa/k2
2 changes: 1 addition & 1 deletion egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -18,7 +18,7 @@

"""
This file computes fbank features of the LibriSpeech dataset.
-Its looks for manifests in the directory data/manifests.
+It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
2 changes: 1 addition & 1 deletion egs/librispeech/ASR/local/compute_fbank_musan.py
@@ -18,7 +18,7 @@

"""
This file computes fbank features of the musan dataset.
-Its looks for manifests in the directory data/manifests.
+It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
4 changes: 2 additions & 2 deletions egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+# Copyright 2021 Piotr Żelasko
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
@@ -40,7 +40,7 @@

class LibriSpeechAsrDataModule(DataModule):
    """
-    DataModule for K2 ASR experiments.
+    DataModule for k2 ASR experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
    and test-other).
2 changes: 1 addition & 1 deletion egs/librispeech/ASR/tdnn_lstm_ctc/decode.py
@@ -348,7 +348,7 @@ def main():
logging.info(f"device: {device}")

HLG = k2.Fsa.from_dict(
torch.load("data/lang_phone/HLG.pt", map_location="cpu")
torch.load(f"{params.lang_dir}/HLG.pt", map_location="cpu")
)
HLG = HLG.to(device)
assert HLG.requires_grad is False
15 changes: 15 additions & 0 deletions egs/yesno/ASR/README.md
@@ -0,0 +1,15 @@
## Yesno recipe

You can run this recipe on **CPU**.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing)

The above Colab notebook finishes training on **CPU**
within two minutes (50 epochs in total).

The WER is

```
[test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
```
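For reference, a minimal local run might look like the following sketch, which mirrors the CI workflow added in this commit (the decode epoch is a guess based on the README's 50-epoch figure and the zero-based `--epoch` values used in CI):

```
cd egs/yesno/ASR
# prepare.sh is added by this commit; it presumably prepares the manifests,
# fbank features, and lang/LM directories consumed by the scripts below.
./prepare.sh
python3 ./tdnn/train.py
python3 ./tdnn/decode.py --epoch 49
```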
134 changes: 134 additions & 0 deletions egs/yesno/ASR/local/compile_hlg.py
@@ -0,0 +1,134 @@
#!/usr/bin/env python3

"""
This script takes as input lang_dir and generates HLG from

    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
    - L, the lexicon, built from lang_dir/L_disambig.pt

        Caution: We use a lexicon that contains disambiguation symbols

    - G, the LM, built from data/lm/G.fst.txt

The generated HLG is saved in $lang_dir/HLG.pt
"""
import argparse
import logging
from pathlib import Path

import k2
import torch

from icefall.lexicon import Lexicon


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        help="""Input and output directory.
        """,
    )

    return parser.parse_args()


def compile_HLG(lang_dir: str) -> k2.Fsa:
    """
    Args:
      lang_dir:
        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.

    Return:
      An FSA representing HLG.
    """
    lexicon = Lexicon(lang_dir)
    max_token_id = max(lexicon.tokens)
    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
    H = k2.ctc_topo(max_token_id)
    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))

    logging.info("Loading G.fst.txt")
    with open("data/lm/G.fst.txt") as f:
        G = k2.Fsa.from_openfst(f.read(), acceptor=False)

    first_token_disambig_id = lexicon.token_table["#0"]
    first_word_disambig_id = lexicon.word_table["#0"]

    L = k2.arc_sort(L)
    G = k2.arc_sort(G)

    logging.info("Intersecting L and G")
    LG = k2.compose(L, G)
    logging.info(f"LG shape: {LG.shape}")

    logging.info("Connecting LG")
    LG = k2.connect(LG)
    logging.info(f"LG shape after k2.connect: {LG.shape}")

    logging.info(type(LG.aux_labels))
    logging.info("Determinizing LG")

    LG = k2.determinize(LG)
    logging.info(type(LG.aux_labels))

    logging.info("Connecting LG after k2.determinize")
    LG = k2.connect(LG)

    logging.info("Removing disambiguation symbols on LG")

    LG.labels[LG.labels >= first_token_disambig_id] = 0

    assert isinstance(LG.aux_labels, k2.RaggedInt)
    LG.aux_labels.values()[LG.aux_labels.values() >= first_word_disambig_id] = 0

    LG = k2.remove_epsilon(LG)
    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")

    LG = k2.connect(LG)
    LG.aux_labels = k2.ragged.remove_values_eq(LG.aux_labels, 0)

    logging.info("Arc sorting LG")
    LG = k2.arc_sort(LG)

    logging.info("Composing H and LG")
    # CAUTION: The name of the inner_labels is fixed
    # to `tokens`. If you want to change it, please
    # also change other places in icefall that are using
    # it.
    HLG = k2.compose(H, LG, inner_labels="tokens")

    logging.info("Connecting HLG")
    HLG = k2.connect(HLG)

    logging.info("Arc sorting HLG")
    HLG = k2.arc_sort(HLG)
    logging.info(f"HLG.shape: {HLG.shape}")

    return HLG


def main():
    args = get_args()
    lang_dir = Path(args.lang_dir)

    if (lang_dir / "HLG.pt").is_file():
        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
        return

    logging.info(f"Processing {lang_dir}")

    HLG = compile_HLG(lang_dir)
    logging.info(f"Saving HLG.pt to {lang_dir}")
    torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)

    main()
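A hedged usage sketch for this script (the canonical invocation likely lives in `prepare.sh`, which is not shown in this diff; `data/lang_phone` matches the example in the `--lang-dir` help above):

```
# Run from egs/yesno/ASR. Assumes data/lang_phone/L_disambig.pt and
# data/lm/G.fst.txt already exist from earlier preparation steps.
python3 ./local/compile_hlg.py --lang-dir data/lang_phone
# Saves data/lang_phone/HLG.pt; a re-run skips if the file already exists.
```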
81 changes: 81 additions & 0 deletions egs/yesno/ASR/local/compute_fbank_yesno.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python3

"""
This file computes fbank features of the yesno dataset.
It looks for manifests in the directory data/manifests.

The generated fbank features are saved in data/fbank.
"""

import logging
import os
from pathlib import Path

import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse.recipes.utils import read_manifests_if_cached

from icefall.utils import get_executor

# Torch's multithreaded behavior needs to be disabled or it wastes a
# lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def compute_fbank_yesno():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")

    # This dataset is rather small, so we use only one job
    num_jobs = min(1, os.cpu_count())
    num_mel_bins = 23

    dataset_parts = (
        "train",
        "test",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            if (output_dir / f"cuts_{partition}.json.gz").is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition:
                cut_set = (
                    cut_set
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 1,  # use one job
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)

    compute_fbank_yesno()
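And a similar sketch for running the feature extraction script (again assuming the manifests were produced by an earlier prepare step; the exact on-disk feature layout is determined by lhotse's `LilcomHdf5Writer`):

```
# Run from egs/yesno/ASR. Assumes the yesno manifests are already
# in data/manifests (written by an earlier lhotse data-prep step).
mkdir -p data/fbank
python3 ./local/compute_fbank_yesno.py
# Writes features under data/fbank/feats_<partition>, plus the cut
# manifests data/fbank/cuts_train.json.gz and data/fbank/cuts_test.json.gz.
```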
