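Update TTS: register the task factory, use the Korean vocoder when several speakers are given, and refresh the docs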

sooftware · sooftware · commit 678f28c8f6c5 · 2021-02-12T21:52:27.000+09:00
diff --git a/examples/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/examples/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/Untitled.ipynb b/examples/Untitled.ipynb
diff --git a/pororo/models/tts/synthesizer.py b/pororo/models/tts/synthesizer.py
@@ -110,12 +110,11 @@ def _spectrogram_postprocess(self, spectrogram):
 
     def predict(self, text: str, speaker: str):
         speakers = speaker.split(',')
-
         spectrogram = synthesize(self.tacotron, f"|{text}", device=self.device)
 
         if len(speakers) > 1:
             spectrogram = self._spectrogram_postprocess(spectrogram)
-            y_g_hat = self.vocoder_en(torch.Tensor(spectrogram).to(self.device).unsqueeze(0))
+            y_g_hat = self.vocoder_ko(torch.Tensor(spectrogram).to(self.device).unsqueeze(0))
             audio = y_g_hat.squeeze()
             audio = audio * 32768.0
             return audio.cpu().detach().numpy()
diff --git a/pororo/pororo.py b/pororo/pororo.py
@@ -46,6 +46,7 @@
     PororoWordTranslationFactory,
     PororoZeroShotFactory,
     PororoSpeechTranslationFactory,
+    PororoTtsFactory,
 )
 
 SUPPORTED_TASKS = {
@@ -132,6 +133,9 @@
     "speech_recognition": PororoAsrFactory,
     "st": PororoSpeechTranslationFactory,
     "speech_translation": PororoSpeechTranslationFactory,
+    "tts": PororoTtsFactory,
+    "text_to_speech": PororoTtsFactory,
+    "speech_synthesis": PororoTtsFactory,
     "ocr": PororoOcrFactory,
     "srl": PororoSrlFactory,
     "semantic_role_labeling": PororoSrlFactory,
diff --git a/pororo/tasks/__init__.py b/pororo/tasks/__init__.py
@@ -51,3 +51,4 @@
 from pororo.tasks.zero_shot_classification import PororoZeroShotFactory
 from pororo.tasks.review_scoring import PororoReviewFactory
 from pororo.tasks.speech_translation import PororoSpeechTranslationFactory
+from pororo.tasks.speech_synthesis import PororoTtsFactory
diff --git a/pororo/tasks/speech_synthesis.py b/pororo/tasks/speech_synthesis.py
@@ -31,18 +31,24 @@ class PororoTtsFactory(PororoFactoryBase):
 
     Examples:
         >>> import IPython
-        >>> from IPython.display import Audio
+        >>> from pororo import Pororo
         >>> model = Pororo('tts', lang='multi')
-        >>> wave = model('how are you?', lang='en', speaker='en')
-        >>> IPython.display.display(IPython.display.Audio(data=wave, rate=22050))
 
-        >>> model = Pororo('tts', lang='multi')
+        >>> # Typical TTS
+        >>> wave = model('how are you?', lang='en')
+        >>> IPython.display.Audio(data=wave, rate=22050)
+
+        >>> # Voice Style Transfer
         >>> wave = model('저는 미국 사람이에요.', lang='ko', speaker='en')
-        >>> IPython.display.display(IPython.display.Audio(data=wave, rate=22050))
+        >>> IPython.display.Audio(data=wave, rate=22050)
+
+        >>> # Code-Switching
+        >>> wave = model('저는 미국 사람이에요.', lang='ko', speaker='en-15,ko')
+        >>> IPython.display.Audio(data=wave, rate=22050)
 
     Notes:
-        Currently 10 languages supports.
-        Supported Languages: English, Korean, Japanese, Chinese, Jejueo, Dutch, German, Spanish, French, Russian
+        Currently supports 11 languages.
+        Supported Languages: English, Korean, Japanese, Chinese, Jejueo, Dutch, German, Spanish, French, Russian, Finnish
         This task can designate a speaker such as ko, en, zh etc.
 
     """
@@ -72,12 +78,8 @@ def load(self, device: str):
 
         """
         if self.config.n_model == "tacotron":
-            from pororo.models.tts.synthesizer import (
-                MultilingualSpeechSynthesizer,
-            )
-            from pororo.models.tts.utils.numerical_pinyin_converter import (
-                convert_from_numerical_pinyin,
-            )
+            from pororo.models.tts.synthesizer import MultilingualSpeechSynthesizer
+            from pororo.models.tts.utils.numerical_pinyin_converter import convert_from_numerical_pinyin
             from pororo.models.tts.utils.text import jejueo_romanize, romanize
 
             tacotron_path = download_or_load("misc/tacotron2", self.config.lang)
@@ -142,7 +144,7 @@ def __init__(
         self.jejueo_romanize = jejueo_romanize
         self.convert_from_numerical_pinyin = convert_from_numerical_pinyin
 
-    def load_g2p_ja(self):
+    def _load_g2p_ja(self):
         """ load g2p module for Japanese """
         self.g2p_ja = PororoG2pFactory(
             task="g2p",
@@ -151,7 +153,7 @@ def load_g2p_ja(self):
         )
         self.g2p_ja = self.g2p_ja.load(self.device)
 
-    def load_g2p_zh(self):
+    def _load_g2p_zh(self):
         """ load g2p module for Chinese """
         self.g2p_zh = PororoG2pFactory(
             task="g2p",
@@ -182,11 +184,11 @@ def _preprocess(
             text = self.romanize(text)
         elif lang == "ja":
             if self.g2p_ja is None:
-                self.load_g2p_ja()
+                self._load_g2p_ja()
             text = self.g2p_ja(text)
         elif lang == "zh":
             if self.g2p_zh is None:
-                self.load_g2p_zh()
+                self._load_g2p_zh()
             text = self.g2p_zh(text).replace("   ", " ")
             text = self.convert_from_numerical_pinyin(text)
         elif lang == "je":
@@ -211,11 +213,5 @@ def __call__(self, text: str, lang: str = "en", speaker: str = None):
         if speaker is None:
             speaker = lang
 
-        if speaker == "ja":
-            speaker = "jp"
-
-        assert (
-            speaker in self.lang_dict.values()
-        ), f"Unsupported speaker: {speaker}\nSupported speaker: {self.lang_dict.keys()}"
         text, speaker = self._preprocess(text, lang, speaker)
         return self.predict(text, speaker)
diff --git a/tts-install.sh b/tts-install.sh