From 063ae27f1da31d2f87482d5763fef6767837f7c2 Mon Sep 17 00:00:00 2001
From: Clabiyau <40484679+Clabiyau@users.noreply.github.com>
Date: Wed, 23 Jul 2025 16:16:43 +0800
Subject: [PATCH 1/3] Update frontend.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a bug where calling with a spk_id caused some fields to be deleted
---
 cosyvoice/cli/frontend.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index f98b0d61..47e72af4 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -24,6 +24,7 @@
 import os
 import re
 import inflect
+import copy
 try:
     import ttsfrd
     use_ttsfrd = True
@@ -173,7 +174,7 @@ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
                            'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                            'llm_embedding': embedding, 'flow_embedding': embedding}
         else:
-            model_input = self.spk2info[zero_shot_spk_id]
+            model_input = copy.deepcopy(self.spk2info[zero_shot_spk_id])
             model_input['text'] = tts_text_token
             model_input['text_len'] = tts_text_token_len
         return model_input

From 13503bc657ee9a57f56dc6b5befccf77b5784d5f Mon Sep 17 00:00:00 2001
From: Clabiyau <40484679+Clabiyau@users.noreply.github.com>
Date: Wed, 23 Jul 2025 17:56:51 +0800
Subject: [PATCH 2/3] Update frontend.py

---
 cosyvoice/cli/frontend.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index 47e72af4..fc0c3da1 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -175,6 +175,10 @@ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
                            'llm_embedding': embedding, 'flow_embedding': embedding}
         else:
             model_input = copy.deepcopy(self.spk2info[zero_shot_spk_id])
+            if prompt_text:
+                prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
+                model_input["prompt_text"] = prompt_text_token
+                model_input["prompt_text_len"] = prompt_text_token_len
             model_input['text'] = tts_text_token
             model_input['text_len'] = tts_text_token_len
         return model_input

From 540d80ac2a37f86f0c79a758ff9ac66865198796 Mon Sep 17 00:00:00 2001
From: Clabiyau <40484679+Clabiyau@users.noreply.github.com>
Date: Thu, 24 Jul 2025 10:32:34 +0800
Subject: [PATCH 3/3] Update frontend.py

---
 cosyvoice/cli/frontend.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py
index fc0c3da1..288c38d3 100644
--- a/cosyvoice/cli/frontend.py
+++ b/cosyvoice/cli/frontend.py
@@ -186,16 +186,16 @@ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
     def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
         # in cross lingual mode, we remove prompt in llm
-        del model_input['prompt_text']
-        del model_input['prompt_text_len']
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        model_input.pop('prompt_text', None)
+        model_input.pop('prompt_text_len', None)
+        model_input.pop('llm_prompt_speech_token', None)
+        model_input.pop('llm_prompt_speech_token_len', None)
         return model_input

     def frontend_instruct(self, tts_text, spk_id, instruct_text):
         model_input = self.frontend_sft(tts_text, spk_id)
         # in instruct mode, we remove spk_embedding in llm due to information leakage
-        del model_input['llm_embedding']
+        model_input.pop('llm_embedding', None)
         instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
@@ -203,8 +203,8 @@ def frontend_instruct(self, tts_text, spk_id, instruct_text):

     def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
         model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
-        del model_input['llm_prompt_speech_token']
-        del model_input['llm_prompt_speech_token_len']
+        model_input.pop('llm_prompt_speech_token', None)
+        model_input.pop('llm_prompt_speech_token_len', None)
         return model_input

     def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
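
Why this patch series works, as a minimal self-contained sketch (fetch_buggy, fetch_fixed, and the toy spk2info values below are illustrative stand-ins, not CosyVoice code): before patch 1, frontend_zero_shot returned the cached spk2info entry by reference, so the del statements downstream in frontend_cross_lingual and frontend_instruct2 stripped fields out of the cache itself, and a second call with the same zero_shot_spk_id raised KeyError. Patch 1 hands back a deep copy, patch 2 lets a caller-supplied prompt_text override the one stored with the speaker, and patch 3 replaces del with dict.pop(key, None), which is idempotent and also tolerates speaker entries that were saved without the optional fields.

    import copy

    # Toy stand-in for the frontend's spk2info cache; values are illustrative.
    spk2info = {'my_spk': {'prompt_text': 'hi', 'llm_prompt_speech_token': [1, 2, 3]}}

    def fetch_buggy(spk_id):
        return spk2info[spk_id]                  # alias into the cache (pre-patch behavior)

    def fetch_fixed(spk_id):
        return copy.deepcopy(spk2info[spk_id])   # independent copy; cache stays intact

    bad = fetch_buggy('my_spk')
    del bad['prompt_text']                           # mimics frontend_cross_lingual's cleanup
    assert 'prompt_text' not in spk2info['my_spk']   # cache corrupted: the next call breaks

    spk2info['my_spk']['prompt_text'] = 'hi'         # restore for the second half of the demo
    good = fetch_fixed('my_spk')
    good.pop('prompt_text', None)                    # removes the key from the copy only
    good.pop('prompt_text', None)                    # safe to repeat; del would raise KeyError
    assert 'prompt_text' in spk2info['my_spk']       # cache untouched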