From 063ae27f1da31d2f87482d5763fef6767837f7c2 Mon Sep 17 00:00:00 2001
From: Clabiyau <40484679+Clabiyau@users.noreply.github.com>
Date: Wed, 23 Jul 2025 16:16:43 +0800
Subject: [PATCH 1/3] Update frontend.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a bug where calling with a spk_id caused some fields to be deleted
---
 cosyvoice/cli/frontend.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index f98b0d61..47e72af4 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -24,6 +24,7 @@
 import os
 import re
 import inflect
+import copy
 try:
     import ttsfrd
     use_ttsfrd = True
@@ -173,7 +174,7 @@ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
                            'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                            'llm_embedding': embedding, 'flow_embedding': embedding}
         else:
-            model_input = self.spk2info[zero_shot_spk_id]
+            model_input = copy.deepcopy(self.spk2info[zero_shot_spk_id])
             model_input['text'] = tts_text_token
             model_input['text_len'] = tts_text_token_len
         return model_input

From 13503bc657ee9a57f56dc6b5befccf77b5784d5f Mon Sep 17 00:00:00 2001
From: Clabiyau <40484679+Clabiyau@users.noreply.github.com>
Date: Wed, 23 Jul 2025 17:56:51 +0800
Subject: [PATCH 2/3] Update frontend.py

---
 cosyvoice/cli/frontend.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index 47e72af4..fc0c3da1 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -175,6 +175,10 @@ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
                            'llm_embedding': embedding, 'flow_embedding': embedding}
         else:
             model_input = copy.deepcopy(self.spk2info[zero_shot_spk_id])
+            if prompt_text:
+                prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
+                model_input["prompt_text"] = prompt_text_token
+                model_input["prompt_text_len"] = prompt_text_token_len
             model_input['text'] = tts_text_token
             model_input['text_len'] = tts_text_token_len
         return model_input

From 540d80ac2a37f86f0c79a758ff9ac66865198796 Mon Sep 17 00:00:00 2001
From: Clabiyau <40484679+Clabiyau@users.noreply.github.com>
Date: Thu, 24 Jul 2025 10:32:34 +0800
Subject: [PATCH 3/3] Update frontend.py

---
 cosyvoice/cli/frontend.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index fc0c3da1..288c38d3 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -186,16 +186,16 @@ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
     def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
         # in cross lingual mode, we remove prompt in llm
-        del model_input['prompt_text']
-        del model_input['prompt_text_len']
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        model_input.pop('prompt_text', None)
+        model_input.pop('prompt_text_len', None)
+        model_input.pop('llm_prompt_speech_token', None)
+        model_input.pop('llm_prompt_speech_token_len', None)
         return model_input

     def frontend_instruct(self, tts_text, spk_id, instruct_text):
         model_input = self.frontend_sft(tts_text, spk_id)
         # in instruct mode, we remove spk_embedding in llm due to information leakage
-        del model_input['llm_embedding']
+        model_input.pop('llm_embedding', None)
         instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
@@ -203,8 +203,8 @@ def frontend_instruct(self, tts_text, spk_id, instruct_text):

     def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        model_input.pop('llm_prompt_speech_token', None)
+        model_input.pop('llm_prompt_speech_token_len', None)
         return model_input

     def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
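
Why this patch series works, as a minimal self-contained sketch (fetch_buggy, fetch_fixed, and the toy spk2info values below are illustrative stand-ins, not CosyVoice code): before patch 1, frontend_zero_shot returned the cached spk2info entry by reference, so the del statements downstream in frontend_cross_lingual and frontend_instruct2 stripped fields out of the cache itself, and a second call with the same zero_shot_spk_id raised KeyError. Patch 1 hands back a deep copy, patch 2 lets a caller-supplied prompt_text override the one stored with the speaker, and patch 3 replaces del with dict.pop(key, None), which is idempotent and also tolerates speaker entries that were saved without the optional fields.

    import copy

    # Toy stand-in for the frontend's spk2info cache; values are illustrative.
    spk2info = {'my_spk': {'prompt_text': 'hi', 'llm_prompt_speech_token': [1, 2, 3]}}

    def fetch_buggy(spk_id):
        return spk2info[spk_id]                  # alias into the cache (pre-patch behavior)

    def fetch_fixed(spk_id):
        return copy.deepcopy(spk2info[spk_id])   # independent copy; cache stays intact

    bad = fetch_buggy('my_spk')
    del bad['prompt_text']                           # mimics frontend_cross_lingual's cleanup
    assert 'prompt_text' not in spk2info['my_spk']   # cache corrupted: the next call breaks

    spk2info['my_spk']['prompt_text'] = 'hi'         # restore for the second half of the demo
    good = fetch_fixed('my_spk')
    good.pop('prompt_text', None)                    # removes the key from the copy only
    good.pop('prompt_text', None)                    # safe to repeat; del would raise KeyError
    assert 'prompt_text' in spk2info['my_spk']       # cache untouched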