Skip to content
This repository was archived by the owner on Oct 6, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/python_run/piper/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ def main() -> None:
parser.add_argument(
"--debug", action="store_true", help="Print DEBUG messages to console"
)
    # Optional word-level alignment output: timings are written to the given
    # file, or to stdout when "-" (or no value) is supplied.
parser.add_argument(
"--alignment-data", help="output alignment data to file (default: stdout)"
)
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
_LOGGER.debug(args)
Expand Down Expand Up @@ -113,6 +117,8 @@ def main() -> None:
"noise_w": args.noise_w,
"sentence_silence": args.sentence_silence,
}
if args.alignment_data:
synthesize_args['alignment_data'] = []

if args.output_raw:
# Read line-by-line
Expand Down Expand Up @@ -154,6 +160,15 @@ def main() -> None:
with wave.open(args.output_file, "wb") as wav_file:
voice.synthesize(text, wav_file, **synthesize_args)

if args.alignment_data:
if (not args.alignment_data) or (args.alignment_data == "-"):
fh = sys.stdout
else:
fh = open(args.alignment_data, "w")
fh.write("start\tend\ttext\n")
for word in synthesize_args['alignment_data']:
fh.write("%d\t%d\t%s\n" % (word["start"] * 1000, word["end"] * 1000, word["word"]))
fh.close()

if __name__ == "__main__":
main()
109 changes: 106 additions & 3 deletions src/python_run/piper/voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,39 @@
from .config import PhonemeType, PiperConfig
from .const import BOS, EOS, PAD
from .util import audio_float_to_int16
import struct
import numpy

_LOGGER = logging.getLogger(__name__)


def levenshteinDistanceDP(list1, list2):
    """Return the Levenshtein (edit) distance between two sequences.

    Classic Wagner-Fischer dynamic programming, used here to decide whether
    a phonemized word is "close enough" to a phoneme sequence extracted from
    a sentence.  Works on any sequences whose elements support ``==``.

    Args:
        list1: first sequence (e.g. a list of phoneme strings).
        list2: second sequence.

    Returns:
        int: minimum number of single-element insertions, deletions and
        substitutions needed to turn ``list1`` into ``list2``.
    """
    n1 = len(list1)
    n2 = len(list2)
    # Rolling one-row DP: only the previous row is ever read, so memory is
    # O(len(list2)) instead of the full (n1+1) x (n2+1) matrix, and the
    # result is a plain int rather than a numpy float.
    prev = list(range(n2 + 1))
    for i in range(1, n1 + 1):
        curr = [i] + [0] * n2
        for j in range(1, n2 + 1):
            if list1[i - 1] == list2[j - 1]:
                # Elements match: no edit needed at this position.
                curr[j] = prev[j - 1]
            else:
                # 1 + cheapest of: insertion, deletion, substitution.
                curr[j] = 1 + min(curr[j - 1], prev[j], prev[j - 1])
        prev = curr
    return prev[n2]

@dataclass
class PiperVoice:
session: onnxruntime.InferenceSession
config: PiperConfig
global_time: int

@staticmethod
def load(
Expand All @@ -45,13 +70,17 @@ def load(
else:
providers = ["CPUExecutionProvider"]

        # set a seed to reduce randomness (for debug)
# onnxruntime.set_seed(0)

return PiperVoice(
config=PiperConfig.from_dict(config_dict),
session=onnxruntime.InferenceSession(
str(model_path),
sess_options=onnxruntime.SessionOptions(),
providers=providers,
),
global_time = 0
)

def phonemize(self, text: str) -> List[List[str]]:
Expand Down Expand Up @@ -95,6 +124,7 @@ def synthesize(
noise_scale: Optional[float] = None,
noise_w: Optional[float] = None,
sentence_silence: float = 0.0,
alignment_data: Optional[list] = None
):
"""Synthesize WAV audio from text."""
wav_file.setframerate(self.config.sample_rate)
Expand All @@ -108,6 +138,7 @@ def synthesize(
noise_scale=noise_scale,
noise_w=noise_w,
sentence_silence=sentence_silence,
alignment_data=alignment_data
):
wav_file.writeframes(audio_bytes)

Expand All @@ -119,23 +150,95 @@ def synthesize_stream_raw(
noise_scale: Optional[float] = None,
noise_w: Optional[float] = None,
sentence_silence: float = 0.0,
alignment_data: Optional[list] = None
) -> Iterable[bytes]:
"""Synthesize raw audio per sentence from text."""
sentence_phonemes = self.phonemize(text)

# 16-bit mono
num_silence_samples = int(sentence_silence * self.config.sample_rate)
silence_bytes = bytes(num_silence_samples * 2)

text = text.replace("/", " / ")
fulltext = text.split(" ")
for phonemes in sentence_phonemes:
if alignment_data != None:
sentence_length = 0
word_length = []
sentence_phonemes = []
sentence_text = []
word = []
# split sentence in words by ' '
for letter in phonemes:
if letter != ' ':
word.append(letter)
else:
sentence_phonemes.append(word)
word = []
if (len(word) > 0):
sentence_phonemes.append(word)
# create temp audio for words
for wordphonemes in sentence_phonemes:
word = fulltext[0]
fulltext.pop(0)
wordph = self.phonemize(word)[0]
# different phonemes and do we have just a phoneme more in the end?
if wordph != wordphonemes and wordph[1:] == wordphonemes[0:len(wordph)-1]:
wordph.pop(0)
# different phonemes do we get a match if we combine with the next word?
if wordph != wordphonemes and len(wordphonemes) > len(wordph):
nextword = fulltext[0]
combinedword = word + " " + nextword
combinedph = self.phonemize(combinedword)[0]
if combinedph == wordphonemes or levenshteinDistanceDP(combinedph, wordphonemes) <= 2:
word = combinedword
wordph = combinedph
fulltext.pop(0)
phoneme_ids = self.phonemes_to_ids(wordph)
wordraw = self.synthesize_ids_to_raw(
phoneme_ids,
speaker_id=speaker_id,
length_scale=length_scale,
noise_scale=noise_scale,
noise_w=noise_w,
)
length = len(wordraw)
sentence_text.append(word)
sentence_length += length
word_length.append(length)
# create real audio
phoneme_ids = self.phonemes_to_ids(phonemes)
yield self.synthesize_ids_to_raw(
raw = self.synthesize_ids_to_raw(
phoneme_ids,
speaker_id=speaker_id,
length_scale=length_scale,
noise_scale=noise_scale,
noise_w=noise_w,
) + silence_bytes
)
if alignment_data != None:
# fix length discrepancy
start = 0
end = 0
is_start = True
global_time_start = self.global_time
# detect "silence" at start and end
for index in range(0, len(raw) - 2, 2):
a = struct.unpack('<h',raw[index:index + 2])[0]
if abs(a) > 1500:
if is_start:
start = index
is_start = False;
end = index
# forward global time with found silence
self.global_time = self.global_time + start / 2 / self.config.sample_rate
# length correction factor, single word vs sentence
correction_factor = ((end - start) / sentence_length) / (2 * self.config.sample_rate)
for index, w in enumerate(word_length):
length = w * correction_factor
alignment_data.append({"word": sentence_text[index], "start": self.global_time, "end": self.global_time + length})
self.global_time += length
# forward global time with found silence at the end
self.global_time = global_time_start + (len(raw) + len(silence_bytes)) / 2 / self.config.sample_rate
yield raw + silence_bytes

def synthesize_ids_to_raw(
self,
Expand Down