
Commit d9cb31d

committed
retry params, comfyui integration, pytorch2.9 support
1 parent b881bf7 commit d9cb31d

6 files changed: 253 additions & 315 deletions

example_workflows/VoxCPM_example.json

Lines changed: 95 additions & 83 deletions
@@ -1,120 +1,100 @@
 {
   "id": "332f7ac5-c30f-44aa-991a-aff51b05038a",
   "revision": 0,
-  "last_node_id": 4,
-  "last_link_id": 8,
+  "last_node_id": 6,
+  "last_link_id": 11,
   "nodes": [
     {
-      "id": 1,
-      "type": "SaveAudio",
+      "id": 4,
+      "type": "MarkdownNote",
       "pos": [
-        -987.0810546875,
-        -1058.6048583984375
+        -992.263427734375,
+        -871.535400390625
       ],
       "size": [
-        277.3636474609375,
-        112
+        311.074462890625,
+        346.9256286621094
       ],
       "flags": {},
-      "order": 3,
+      "order": 0,
       "mode": 0,
-      "inputs": [
-        {
-          "name": "audio",
-          "type": "AUDIO",
-          "link": 1
-        }
-      ],
+      "inputs": [],
       "outputs": [],
-      "properties": {
-        "cnr_id": "comfy-core",
-        "ver": "0.3.52",
-        "Node name for S&R": "SaveAudio",
-        "ue_properties": {
-          "widget_ue_connectable": {
-            "filename_prefix": true,
-            "audioUI": true
-          },
-          "version": "7.0.1"
-        }
-      },
+      "title": "Note",
+      "properties": {},
       "widgets_values": [
-        "audio/VoxCPM"
-      ]
+        "# ComfyUI-VoxCPM\n\n**VoxCPM** a novel tokenizer-free TTS system for context-aware speech generation and true-to-life voice cloning.\n\n## Models\nThis node automatically downloads the required model files.\n\n| Model | Parameters | Hugging Face Link |\n|:---|:---:|:---|\n| VoxCPM-0.5B | 0.5B | [openbmb/VoxCPM-0.5B](https://huggingface.co/openbmb/VoxCPM-0.5B) |\n\n## Support \n\n- Don't know how to update PyTorch?\n- Need help with ComfyUI?\n- Need technical support?\n\n### Or do you just have questions? Then join the [@TokenDiffusion Hub](https://t.me/TokenDiff_hub) group\n\n### AI news [TokenDiffusion](https://t.me/TokenDiff)"
+      ],
+      "color": "#233",
+      "bgcolor": "#355"
     },
     {
-      "id": 4,
+      "id": 6,
       "type": "MarkdownNote",
       "pos": [
-        -1834.9903564453125,
-        -871.060546875
+        -1840.9210205078125,
+        -874.2627563476562
       ],
       "size": [
-        341.9835510253906,
-        346.01654052734375
+        360.7866516113281,
+        348.9750061035156
       ],
       "flags": {},
-      "order": 0,
+      "order": 1,
       "mode": 0,
       "inputs": [],
       "outputs": [],
-      "title": "Note",
       "properties": {},
       "widgets_values": [
-        "# ComfyUI-VoxCPM\n\n**VoxCPM** a novel tokenizer-free TTS system for context-aware speech generation and true-to-life voice cloning.\n\n## Models\nThis node automatically downloads the required model files.\n\n| Model | Parameters | Hugging Face Link |\n|:---|:---:|:---|\n| VoxCPM-0.5B | 0.5B | [openbmb/VoxCPM-0.5B](https://huggingface.co/openbmb/VoxCPM-0.5B) |\n\n## Support \n\n- Don't know how to update PyTorch?\n- Need help with ComfyUI?\n- Need technical support?\n\n### Or do you just have questions? Then join the [@TokenDiffusion Hub](https://t.me/TokenDiff_hub) group\n\n### AI news [TokenDiffusion](https://t.me/TokenDiff)"
+        "## 🎤 Achieving High-Quality Voice Clones\n\nTo achieve the best voice cloning results, providing an accurate `prompt_text` is **critical**. This text acts as a transcript that aligns the sound of the `prompt_audio` with the words being spoken, teaching the model the speaker's unique vocal characteristics.\n\n### How to Use `prompt_text` Effectively\n\n#### 1. **Provide a Verbatim Transcript**\nThe `prompt_text` must be a word-for-word transcript of the `prompt_audio`. Do not summarize or describe the audio.\n\n✅ **Correct:** `The quick brown fox jumps over the lazy dog.`\n\n❌ **Incorrect:** `A person saying a sentence about a fox.`\n\n#### 2. **Punctuation is Important**\nUse accurate punctuation to capture the speaker's intonation. The model learns how the speaker ends sentences, asks questions, or shows excitement.\n\n- **For a statement:** `This is a great example.`\n- **For a question:** `Is this a great example?`\n- **For excitement:** `This is a great example!`\n\n#### 3. **Match Audio and Text Length**\nThe audio clip should be long enough to capture the speaker's natural pacing and rhythm.\n\n👍 **Good:** A 5-15 second clip of continuous, clear speech.\n\n👌 **Okay:** A 3-5 second clip.\n\n⚠️ **Warning:** Very short clips (< 3 seconds) may result in a less stable or robotic-sounding clone.\n\n\n> **Note:**\n> For the best results, use a clean, high-quality `prompt_audio` with minimal background noise, reverb, or music. The model will try to clone the *entire* acoustic environment, not just the voice."
       ],
-      "color": "#233",
-      "bgcolor": "#355"
+      "color": "#432",
+      "bgcolor": "#653"
     },
     {
-      "id": 2,
-      "type": "LoadAudio",
+      "id": 1,
+      "type": "SaveAudio",
       "pos": [
-        -1843.4453125,
-        -1058.6048583984375
+        -991.1134643554688,
+        -1058.956787109375
       ],
       "size": [
-        350.4437255859375,
-        136
+        277.3636474609375,
+        112
       ],
       "flags": {},
-      "order": 1,
+      "order": 4,
       "mode": 0,
-      "inputs": [],
-      "outputs": [
+      "inputs": [
         {
-          "name": "AUDIO",
+          "name": "audio",
           "type": "AUDIO",
-          "links": [
-            8
-          ]
+          "link": 10
         }
       ],
+      "outputs": [],
       "properties": {
+        "Node name for S&R": "SaveAudio",
         "cnr_id": "comfy-core",
         "ver": "0.3.52",
-        "Node name for S&R": "LoadAudio",
         "ue_properties": {
           "widget_ue_connectable": {
-            "audio": true,
-            "audioUI": true,
-            "upload": true
+            "filename_prefix": true,
+            "audioUI": true
           },
           "version": "7.0.1"
         }
       },
       "widgets_values": [
-        "male_stewie.mp3",
-        null,
-        null
+        "audio/VoxCPM"
       ]
     },
     {
-      "id": 3,
+      "id": 5,
       "type": "VoxCPM_TTS",
       "pos": [
         -1438.1444091796875,
-        -1058.6048583984375
+        -1058.956787109375
       ],
       "size": [
         399.0082702636719,
@@ -128,66 +108,98 @@
           "name": "prompt_audio",
           "shape": 7,
           "type": "AUDIO",
-          "link": 8
+          "link": null
         }
       ],
       "outputs": [
         {
           "name": "Generated Audio",
           "type": "AUDIO",
           "links": [
-            1
+            10
           ]
         }
       ],
       "properties": {
-        "cnr_id": "ComfyUI-VoxCPM",
-        "ver": "e9845ede9bc6ad1febd3a9b5104a040315a12d50",
-        "Node name for S&R": "VoxCPM_TTS",
-        "aux_id": "wildminder/ComfyUI-VoxCPM"
+        "Node name for S&R": "VoxCPM_TTS"
       },
       "widgets_values": [
         "VoxCPM-0.5B",
         "You're baking with ingredients from the Realm of Madness?! I am… impressed. And deeply concerned.",
         2,
         20,
         true,
-        779450847354053,
+        779450847354054,
         "fixed",
         false,
-        "cpu",
-        "angry kid"
+        "cuda",
+        3,
+        5,
+        ""
       ],
       "color": "#232",
       "bgcolor": "#353"
+    },
+    {
+      "id": 2,
+      "type": "LoadAudio",
+      "pos": [
+        -1843.4453125,
+        -1058.956787109375
+      ],
+      "size": [
+        350.4437255859375,
+        136
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": []
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "LoadAudio",
+        "cnr_id": "comfy-core",
+        "ver": "0.3.52",
+        "ue_properties": {
+          "widget_ue_connectable": {
+            "audio": true,
+            "audioUI": true,
+            "upload": true
+          },
+          "version": "7.0.1"
+        }
+      },
+      "widgets_values": [
+        "male_stewie.mp3",
+        null,
+        null
+      ]
     }
   ],
   "links": [
     [
-      1,
-      3,
+      10,
+      5,
       0,
       1,
       0,
       "AUDIO"
-    ],
-    [
-      8,
-      2,
-      0,
-      3,
-      0,
-      "AUDIO"
     ]
   ],
   "groups": [],
   "config": {},
   "extra": {
     "ds": {
-      "scale": 1.2100000000000006,
+      "scale": 1.1,
       "offset": [
         1943.4453125,
-        1158.6048583984375
+        1158.956787109375
       ]
     },
     "frontendVersion": "1.26.11",
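For anyone rebuilding this graph by hand, the rewired workflow can be sanity-checked with a short Python snippet. This is a minimal sketch, not part of the commit; it assumes only the file path shown above and ComfyUI's serialized link layout of [link_id, from_node, from_slot, to_node, to_slot, type], which is what the single remaining link [10, 5, 0, 1, 0, "AUDIO"] (VoxCPM_TTS output 0 into SaveAudio input 0) encodes.

import json

# Load the updated example workflow shipped with the node pack.
with open("example_workflows/VoxCPM_example.json", "r", encoding="utf-8") as f:
    workflow = json.load(f)

# After this commit the graph holds two MarkdownNote nodes (ids 4 and 6),
# SaveAudio (1), VoxCPM_TTS (5), and a LoadAudio node (2) left unconnected.
for node in workflow["nodes"]:
    print(node["id"], node["type"])

# Each link is [link_id, from_node, from_slot, to_node, to_slot, type];
# expect a single AUDIO link from VoxCPM_TTS into SaveAudio.
for link in workflow["links"]:
    print(link)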

requirements.txt

Lines changed: 1 addition & 2 deletions
@@ -13,5 +13,4 @@ sortedcontainers
 soundfile
 diffusers
 tqdm
-bitsandbytes
-torchcodec
+bitsandbytes

src/voxcpm/core.py

Lines changed: 23 additions & 16 deletions
@@ -1,6 +1,5 @@
 import torch
 import os
-import tempfile
 import logging
 from huggingface_hub import snapshot_download
 from .model.voxcpm import VoxCPMModel
@@ -60,18 +59,21 @@ def from_pretrained(cls,
 
         return cls(voxcpm_model_path=local_path)
 
-    def generate(self,
-                 text : str,
-                 prompt_wav_path : str = None,
-                 prompt_text : str = None,
-                 cfg_value : float = 2.0,
-                 inference_timesteps : int = 10,
-                 max_length : int = 4096,
-                 normalize : bool = True,
-                 retry_badcase : bool = True,
-                 retry_badcase_max_times : int = 3,
-                 retry_badcase_ratio_threshold : float = 6.0,
-                 ):
+    def generate(
+        self,
+        text : str,
+        prompt_wav_path : str = None,
+        prompt_waveform: torch.Tensor = None,
+        prompt_sample_rate: int = None,
+        prompt_text : str = None,
+        cfg_value : float = 2.0,
+        inference_timesteps : int = 10,
+        max_length : int = 4096,
+        normalize : bool = True,
+        retry_badcase : bool = True,
+        retry_badcase_max_times : int = 3,
+        retry_badcase_ratio_threshold : float = 6.0,
+    ):
         """Synthesize speech for the given text and return a single waveform.
 
         This method optionally builds and reuses a prompt cache. If an external
@@ -100,13 +102,18 @@ def generate(self,
         texts = [t.strip() for t in texts if t.strip()]
         final_wav = []
 
-        if prompt_wav_path is not None and prompt_text is not None:
+        # Check for either waveform or path for cloning
+        is_cloning = prompt_waveform is not None or prompt_wav_path is not None
+        if is_cloning and prompt_text:
             fixed_prompt_cache = self.tts_model.build_prompt_cache(
+                prompt_text=prompt_text,
                 prompt_wav_path=prompt_wav_path,
-                prompt_text=prompt_text
+                prompt_waveform=prompt_waveform,
+                prompt_sample_rate=prompt_sample_rate
             )
         else:
-            fixed_prompt_cache = None # will be built from the first inference
+            # will be built from the first inference
+            fixed_prompt_cache = None
 
         for i, sub_text in enumerate(texts):
             if sub_text.strip() == "":
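Taken together with the dropped tempfile import, the new keyword arguments let a caller hand decoded audio straight to the model instead of writing a temporary file first, which is presumably how the ComfyUI node now feeds its AUDIO tensors in. A minimal usage sketch under stated assumptions: that the class defined in core.py is exported as VoxCPM, that from_pretrained accepts the Hugging Face repo id mentioned in the workflow note, and that torchaudio is available to decode the reference clip.

import torchaudio  # assumed available for decoding the reference clip
from voxcpm.core import VoxCPM  # assumption: class name and import path are not shown in this diff

# Downloads the checkpoint via huggingface_hub on first use (see snapshot_download above).
model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B")

# New in this commit: clone from an in-memory waveform instead of a file path.
waveform, sample_rate = torchaudio.load("male_stewie.mp3")  # reference clip named in the example workflow

wav = model.generate(
    text="You're baking with ingredients from the Realm of Madness?!",
    prompt_waveform=waveform,          # torch.Tensor, added by this commit
    prompt_sample_rate=sample_rate,    # added by this commit
    prompt_text="Verbatim transcript of the reference clip goes here.",
    cfg_value=2.0,
    inference_timesteps=10,
    retry_badcase=True,                # presumably the "retry params" from the commit message
    retry_badcase_max_times=3,
    retry_badcase_ratio_threshold=6.0,
)
# generate() returns a single waveform (see the docstring above); save or play it as needed.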
