# *****************************************************************************
#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the NVIDIA CORPORATION nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import os
import sys
import time
import subprocess
import argparse

import torch
import torch.distributed as dist
from torch.autograd import Variable

def reduce_tensor(tensor, num_gpus):
    # Sum the tensor across all ranks, then divide to get the average.
    # dist.ReduceOp replaces the deprecated dist.reduce_op namespace.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= num_gpus
    return rt

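# Illustrative usage sketch (not part of this file's control flow): after a
# backward pass, each rank can average its scalar loss so that all workers
# log the same value. `loss` here is a hypothetical loss tensor:
#
#   reduced_loss = reduce_tensor(loss.data, num_gpus).item()
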
def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Set cuda device so everything is done on the right GPU.
    torch.cuda.set_device(rank % torch.cuda.device_count())

    # Initialize distributed communication
    dist.init_process_group(dist_backend, init_method=dist_url,
                            world_size=num_gpus, rank=rank,
                            group_name=group_name)

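# Illustrative call sketch: each spawned worker is expected to run this once
# at startup, before building the model. The backend and URL values below are
# example assumptions, not mandated by this file:
#
#   init_distributed(rank, num_gpus, group_name,
#                    dist_backend="nccl", dist_url="tcp://localhost:54321")
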
def _flatten_dense_tensors(tensors):
    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
    same dense type.
    Since inputs are dense, the resulting tensor will be a concatenated 1D
    buffer. Element-wise operation on this buffer will be equivalent to
    operating individually.
    Arguments:
        tensors (Iterable[Tensor]): dense tensors to flatten.
    Returns:
        A contiguous 1D buffer containing input tensors.
    """
    if len(tensors) == 1:
        return tensors[0].contiguous().view(-1)
    flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
    return flat

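# Quick illustrative check of the flattening contract (values chosen
# arbitrarily): two dense tensors with 2 and 3 elements flatten into one
# contiguous buffer of 5 elements.
#
#   a, b = torch.zeros(2), torch.ones(3)
#   flat = _flatten_dense_tensors([a, b])   # flat.shape == (5,)
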
def _unflatten_dense_tensors(flat, tensors):
    """View a flat buffer using the sizes of tensors. Assume that tensors are of
    same dense type, and that flat is given by _flatten_dense_tensors.
    Arguments:
        flat (Tensor): flattened dense tensors to unflatten.
        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
            unflatten flat.
    Returns:
        Unflattened dense tensors with sizes same as tensors and values from
        flat.
    """
    outputs = []
    offset = 0
    for tensor in tensors:
        numel = tensor.numel()
        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
        offset += numel
    return tuple(outputs)

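# Illustrative round trip, continuing the sketch above: unflattening with the
# original tensors as shape templates recovers views matching their shapes,
# so a collective on the flat buffer is equivalent to per-tensor collectives.
#
#   out_a, out_b = _unflatten_dense_tensors(flat, [a, b])
#   # out_a.shape == a.shape and out_b.shape == b.shape
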
def apply_gradient_allreduce(module):
    """
    Modifies an existing model to all-reduce its gradients after backward,
    without wrapping it in a new class, so no ``.module`` indirection is
    needed (unlike DistributedDataParallel).
    """
    # Warn when half-precision gradients run over the Gloo backend, which
    # handles them poorly. dist.get_backend() replaces the private
    # dist._backend attribute used by early PyTorch releases.
    module.warn_on_half = (dist.get_backend() == "gloo")

    # Broadcast parameters and buffers from rank 0 so every worker starts
    # from identical weights.
    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            # Bucket gradients by tensor type so each bucket can be
            # flattened into one contiguous buffer and reduced with a
            # single collective call.
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow. " +
                          "It is recommended to use the NCCL backend in this case.")
                    module.warn_on_half = False

            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    # Register a gradient hook on every parameter: the first hook that fires
    # during backward queues allreduce_params() to run once the rest of the
    # backward graph has finished executing.
    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module

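# Illustrative wiring sketch, assuming a hypothetical model class `MyModel`
# and a training loop like the one train.py is expected to contain: once the
# module is wrapped, a plain backward() call triggers the gradient all-reduce.
#
#   model = MyModel().cuda()
#   model = apply_gradient_allreduce(model)
#   loss = model(batch)                # forward hook sets needs_reduction
#   loss.backward()                    # gradient hooks queue allreduce_params()
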
def main(config, stdout_dir, args_str):
    args_list = ['train.py']
    args_list += args_str.split(' ') if len(args_str) > 0 else []

    args_list.append('--config={}'.format(config))

    num_gpus = torch.cuda.device_count()
    args_list.append('--num_gpus={}'.format(num_gpus))
    # Placeholder rank, rewritten per worker below via args_list[-2]. Without
    # this entry, that assignment would clobber --num_gpus instead.
    args_list.append('--rank={}'.format(0))
    args_list.append("--group_name=group_{}".format(time.strftime("%Y_%m_%d-%H%M%S")))

    if not os.path.isdir(stdout_dir):
        os.makedirs(stdout_dir)
        os.chmod(stdout_dir, 0o775)

    workers = []

    for i in range(num_gpus):
        args_list[-2] = '--rank={}'.format(i)
        # Rank 0 inherits the console; every other rank logs to its own file.
        stdout = None if i == 0 else open(
            os.path.join(stdout_dir, "GPU_{}.log".format(i)), "w")
        print(args_list)
        p = subprocess.Popen([str(sys.executable)] + args_list, stdout=stdout)
        workers.append(p)

    for p in workers:
        p.wait()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True,
                        help='JSON file for configuration')
    parser.add_argument('-s', '--stdout_dir', type=str, default=".",
                        help='directory to save stdout logs')
    parser.add_argument(
        '-a', '--args_str', type=str, default='',
        help='double-quoted string of space-separated key=value pairs')

    args = parser.parse_args()
    main(args.config, args.stdout_dir, args.args_str)
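
# Illustrative launch sketch, assuming this file is saved as distributed.py
# (the filename is not fixed by the source) and that config.json and the extra
# flag are placeholders. This spawns one train.py process per visible GPU:
#
#   python distributed.py -c config.json -s logs -a "--some_flag=value"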