Skip to content
This repository was archived by the owner on Oct 6, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/python_run/piper/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ def main() -> None:
parser.add_argument(
"--debug", action="store_true", help="Print DEBUG messages to console"
)
    # Optional word-level alignment output: timings are written to the given
    # file, or to stdout when "-" (or no value) is supplied.
parser.add_argument(
"--alignment-data", help="output alignment data to file (default: stdout)"
)
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
_LOGGER.debug(args)
Expand Down Expand Up @@ -113,6 +117,8 @@ def main() -> None:
"noise_w": args.noise_w,
"sentence_silence": args.sentence_silence,
}
if args.alignment_data:
synthesize_args['alignment_data'] = []

if args.output_raw:
# Read line-by-line
Expand Down Expand Up @@ -154,6 +160,15 @@ def main() -> None:
with wave.open(args.output_file, "wb") as wav_file:
voice.synthesize(text, wav_file, **synthesize_args)

if args.alignment_data:
if (not args.alignment_data) or (args.alignment_data == "-"):
fh = sys.stdout
else:
fh = open(args.alignment_data, "w")
fh.write("start\tend\ttext\n")
for word in synthesize_args['alignment_data']:
fh.write("%d\t%d\t%s\n" % (word["start"] * 1000, word["end"] * 1000, word["word"]))
fh.close()

if __name__ == "__main__":
main()
109 changes: 106 additions & 3 deletions src/python_run/piper/voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,39 @@
from .config import PhonemeType, PiperConfig
from .const import BOS, EOS, PAD
from .util import audio_float_to_int16
import struct
import numpy

_LOGGER = logging.getLogger(__name__)


def levenshteinDistanceDP(list1, list2):
    """Return the Levenshtein (edit) distance between two sequences.

    Classic Wagner-Fischer dynamic programming, used here to decide whether
    a phonemized word is "close enough" to a phoneme sequence extracted from
    a sentence.  Works on any sequences whose elements support ``==``.

    Args:
        list1: first sequence (e.g. a list of phoneme strings).
        list2: second sequence.

    Returns:
        int: minimum number of single-element insertions, deletions and
        substitutions needed to turn ``list1`` into ``list2``.
    """
    n1 = len(list1)
    n2 = len(list2)
    # Rolling one-row DP: only the previous row is ever read, so memory is
    # O(len(list2)) instead of the full (n1+1) x (n2+1) matrix, and the
    # result is a plain int rather than a numpy float.
    prev = list(range(n2 + 1))
    for i in range(1, n1 + 1):
        curr = [i] + [0] * n2
        for j in range(1, n2 + 1):
            if list1[i - 1] == list2[j - 1]:
                # Elements match: no edit needed at this position.
                curr[j] = prev[j - 1]
            else:
                # 1 + cheapest of: insertion, deletion, substitution.
                curr[j] = 1 + min(curr[j - 1], prev[j], prev[j - 1])
        prev = curr
    return prev[n2]

@dataclass
class PiperVoice:
session: onnxruntime.InferenceSession
config: PiperConfig
global_time: int

@staticmethod
def load(
Expand All @@ -45,13 +70,17 @@ def load(
else:
providers = ["CPUExecutionProvider"]

        # set a seed to reduce randomness (for debug)
# onnxruntime.set_seed(0)

return PiperVoice(
config=PiperConfig.from_dict(config_dict),
session=onnxruntime.InferenceSession(
str(model_path),
sess_options=onnxruntime.SessionOptions(),
providers=providers,
),
global_time = 0
)

def phonemize(self, text: str) -> List[List[str]]:
Expand Down Expand Up @@ -95,6 +124,7 @@ def synthesize(
noise_scale: Optional[float] = None,
noise_w: Optional[float] = None,
sentence_silence: float = 0.0,
alignment_data: Optional[list] = None
):
"""Synthesize WAV audio from text."""
wav_file.setframerate(self.config.sample_rate)
Expand All @@ -108,6 +138,7 @@ def synthesize(
noise_scale=noise_scale,
noise_w=noise_w,
sentence_silence=sentence_silence,
alignment_data=alignment_data
):
wav_file.writeframes(audio_bytes)

Expand All @@ -119,23 +150,95 @@ def synthesize_stream_raw(
noise_scale: Optional[float] = None,
noise_w: Optional[float] = None,
sentence_silence: float = 0.0,
alignment_data: Optional[list] = None
) -> Iterable[bytes]:
"""Synthesize raw audio per sentence from text."""
sentence_phonemes = self.phonemize(text)

# 16-bit mono
num_silence_samples = int(sentence_silence * self.config.sample_rate)
silence_bytes = bytes(num_silence_samples * 2)

text = text.replace("/", " / ")
fulltext = text.split(" ")
for phonemes in sentence_phonemes:
if alignment_data != None:
sentence_length = 0
word_length = []
sentence_phonemes = []
sentence_text = []
word = []
# split sentence in words by ' '
for letter in phonemes:
if letter != ' ':
word.append(letter)
else:
sentence_phonemes.append(word)
word = []
if (len(word) > 0):
sentence_phonemes.append(word)
# create temp audio for words
for wordphonemes in sentence_phonemes:
word = fulltext[0]
fulltext.pop(0)
wordph = self.phonemize(word)[0]
# different phonemes and do we have just a phoneme more in the end?
if wordph != wordphonemes and wordph[1:] == wordphonemes[0:len(wordph)-1]:
wordph.pop(0)
# different phonemes do we get a match if we combine with the next word?
if wordph != wordphonemes and len(wordphonemes) > len(wordph):
nextword = fulltext[0]
combinedword = word + " " + nextword
combinedph = self.phonemize(combinedword)[0]
if combinedph == wordphonemes or levenshteinDistanceDP(combinedph, wordphonemes) <= 2:
word = combinedword
wordph = combinedph
fulltext.pop(0)
phoneme_ids = self.phonemes_to_ids(wordph)
wordraw = self.synthesize_ids_to_raw(
phoneme_ids,
speaker_id=speaker_id,
length_scale=length_scale,
noise_scale=noise_scale,
noise_w=noise_w,
)
length = len(wordraw)
sentence_text.append(word)
sentence_length += length
word_length.append(length)
# create real audio
phoneme_ids = self.phonemes_to_ids(phonemes)
yield self.synthesize_ids_to_raw(
raw = self.synthesize_ids_to_raw(
phoneme_ids,
speaker_id=speaker_id,
length_scale=length_scale,
noise_scale=noise_scale,
noise_w=noise_w,
) + silence_bytes
)
if alignment_data != None:
# fix length discrepancy
start = 0
end = 0
is_start = True
global_time_start = self.global_time
# detect "silence" at start and end
for index in range(0, len(raw) - 2, 2):
a = struct.unpack('<h',raw[index:index + 2])[0]
if abs(a) > 1500:
if is_start:
start = index
is_start = False;
end = index
# forward global time with found silence
self.global_time = self.global_time + start / 2 / self.config.sample_rate
# length correction factor, single word vs sentence
correction_factor = ((end - start) / sentence_length) / (2 * self.config.sample_rate)
for index, w in enumerate(word_length):
length = w * correction_factor
alignment_data.append({"word": sentence_text[index], "start": self.global_time, "end": self.global_time + length})
self.global_time += length
# forward global time with found silence at the end
self.global_time = global_time_start + (len(raw) + len(silence_bytes)) / 2 / self.config.sample_rate
yield raw + silence_bytes

def synthesize_ids_to_raw(
self,
Expand Down