Initial commit

bshall · May 31, 2019 · 32401d1 · 32401d1
commit 32401d1
Show file tree

Hide file tree

Showing 10 changed files with 642 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,107 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Pycharm project settings
+.idea
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1 @@
+# UniversalVocoding
diff --git a/config.json b/config.json
@@ -0,0 +1,31 @@
+{
+    "preprocessing": {
+        "sample_rate": 16000,
+        "num_fft": 2048,
+        "num_mels": 80,
+        "fmin": 50,
+        "min_level_db": -100,
+        "hop_length": 200,
+        "win_length": 800,
+        "preemph": 0.97,
+        "bits": 9,
+        "num_evaluation_utterances" : 20
+    },
+
+    "vocoder": {
+        "conditioning_channels": 128,
+        "embedding_dim": 256,
+        "rnn_channels": 896,
+        "fc_channels": 512,
+        "learning_rate": 1e-4,
+        "batch_size": 16,
+        "checkpoint_interval": 25000,
+        "num_steps": 200000,
+        "sample_frames": 24,
+        "generate": {
+            "batched": true,
+            "target": 8000,
+            "overlap": 400
+        }
+    }
+}
diff --git a/dataset.py b/dataset.py
@@ -0,0 +1,32 @@
+import numpy as np
+import torch
+import os
+from random import randint
+from torch.utils.data import Dataset
+
+
+class VocoderDataset(Dataset):
+    def __init__(self, meta_file, sample_frames, hop_length, bits):
+        self.sample_frames = sample_frames
+        self.hop_length = hop_length
+        self.bits = bits
+
+        with open(meta_file, encoding="utf-8") as f:
+            self.metadata = [line.strip().split("|") for line in f]
+        self.metadata = [m for m in self.metadata if int(m[3]) > self.sample_frames + 1]
+
+    def __len__(self):
+        return len(self.metadata)
+
+    def __getitem__(self, index):
+        _, audio_path, mel_path, _ = self.metadata[index]
+
+        audio = np.load(os.path.join(audio_path))
+        mel = np.load(os.path.join(mel_path))
+
+        rand_pos = randint(0, mel.shape[0] - self.sample_frames - 2)
+
+        audio = audio[rand_pos*self.hop_length:(rand_pos + self.sample_frames) * self.hop_length + 1]
+        mel = mel[rand_pos:rand_pos + self.sample_frames, :]
+
+        return torch.LongTensor(audio), torch.FloatTensor(mel)
diff --git a/generate.py b/generate.py
@@ -0,0 +1,55 @@
+import argparse
+import os
+import json
+
+import torch
+import numpy as np
+
+from model import Vocoder
+from utils import load_wav, save_wav, melspectrogram
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--resume", type=str, help="Checkpoint path to resume")
+    parser.add_argument("--data-dir", type=str, default="./data")
+    parser.add_argument("--gen-dir", type=str, default="./generated")
+    parser.add_argument("--wav_path", type=str)
+    parser.add_argument("--batched", action='store_true')
+    args = parser.parse_args()
+    with open("config.json") as f:
+        params = json.load(f)
+    os.makedirs(args.gen_dir, exist_ok=True)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    model = Vocoder(mel_channels=params["preprocessing"]["num_mels"],
+                    conditioning_channels=params["vocoder"]["conditioning_channels"],
+                    embedding_dim=params["vocoder"]["embedding_dim"],
+                    rnn_channels=params["vocoder"]["rnn_channels"],
+                    fc_channels=params["vocoder"]["fc_channels"],
+                    bits=params["preprocessing"]["bits"],
+                    hop_length=params["preprocessing"]["hop_length"])
+    model.to(device)
+
+    print("Resume checkpoint from: {}:".format(args.resume))
+    checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
+    model.load_state_dict(checkpoint["model"])
+    model_step = checkpoint["steps"]
+
+    wav = load_wav(args.wav_path, params["preprocessing"]["sample_rate"])
+    utterance_id = os.path.basename(args.wav_path).split(".")[0]
+    wav = wav / np.abs(wav).max() * 0.999
+    mel = melspectrogram(wav, sample_rate=params["preprocessing"]["sample_rate"],
+                         num_mels=params["preprocessing"]["num_mels"],
+                         num_fft=params["preprocessing"]["num_fft"],
+                         preemph=params["preprocessing"]["preemph"],
+                         min_level_db=params["preprocessing"]["min_level_db"],
+                         hop_length=params["preprocessing"]["hop_length"],
+                         win_length=params["preprocessing"]["win_length"],
+                         fmin=params["preprocessing"]["fmin"])
+    mel = torch.FloatTensor(mel).unsqueeze(0).to(device)
+    output = model.generate(mel, args.batched,
+                            params["vocoder"]["generate"]["target"],
+                            params["vocoder"]["generate"]["overlap"])
+    path = os.path.join(args.gen_dir, "gen_{}_model_steps_{}.wav".format(utterance_id, model_step))
+    save_wav(path, output, params["preprocessing"]["sample_rate"])