
Commit d9cb31d

committed
retry params, comfyui integration, pytorch2.9 support
1 parent b881bf7 commit d9cb31d

6 files changed: 253 additions & 315 deletions

example_workflows/VoxCPM_example.json

Lines changed: 95 additions & 83 deletions
@@ -1,120 +1,100 @@
 {
   "id": "332f7ac5-c30f-44aa-991a-aff51b05038a",
   "revision": 0,
-  "last_node_id": 4,
-  "last_link_id": 8,
+  "last_node_id": 6,
+  "last_link_id": 11,
   "nodes": [
     {
-      "id": 1,
-      "type": "SaveAudio",
+      "id": 4,
+      "type": "MarkdownNote",
       "pos": [
-        -987.0810546875,
-        -1058.6048583984375
+        -992.263427734375,
+        -871.535400390625
       ],
       "size": [
-        277.3636474609375,
-        112
+        311.074462890625,
+        346.9256286621094
       ],
       "flags": {},
-      "order": 3,
+      "order": 0,
       "mode": 0,
-      "inputs": [
-        {
-          "name": "audio",
-          "type": "AUDIO",
-          "link": 1
-        }
-      ],
+      "inputs": [],
       "outputs": [],
-      "properties": {
-        "cnr_id": "comfy-core",
-        "ver": "0.3.52",
-        "Node name for S&R": "SaveAudio",
-        "ue_properties": {
-          "widget_ue_connectable": {
-            "filename_prefix": true,
-            "audioUI": true
-          },
-          "version": "7.0.1"
-        }
-      },
+      "title": "Note",
+      "properties": {},
       "widgets_values": [
-        "audio/VoxCPM"
-      ]
+        "# ComfyUI-VoxCPM\n\n**VoxCPM** a novel tokenizer-free TTS system for context-aware speech generation and true-to-life voice cloning.\n\n## Models\nThis node automatically downloads the required model files.\n\n| Model | Parameters | Hugging Face Link |\n|:---|:---:|:---|\n| VoxCPM-0.5B | 0.5B | [openbmb/VoxCPM-0.5B](https://huggingface.co/openbmb/VoxCPM-0.5B) |\n\n## Support \n\n- Don't know how to update PyTorch?\n- Need help with ComfyUI?\n- Need technical support?\n\n### Or do you just have questions? Then join the [@TokenDiffusion Hub](https://t.me/TokenDiff_hub) group\n\n### AI news [TokenDiffusion](https://t.me/TokenDiff)"
+      ],
+      "color": "#233",
+      "bgcolor": "#355"
     },
     {
-      "id": 4,
+      "id": 6,
       "type": "MarkdownNote",
       "pos": [
-        -1834.9903564453125,
-        -871.060546875
+        -1840.9210205078125,
+        -874.2627563476562
       ],
       "size": [
-        341.9835510253906,
-        346.01654052734375
+        360.7866516113281,
+        348.9750061035156
       ],
       "flags": {},
-      "order": 0,
+      "order": 1,
       "mode": 0,
       "inputs": [],
       "outputs": [],
-      "title": "Note",
       "properties": {},
       "widgets_values": [
-        "# ComfyUI-VoxCPM\n\n**VoxCPM** a novel tokenizer-free TTS system for context-aware speech generation and true-to-life voice cloning.\n\n## Models\nThis node automatically downloads the required model files.\n\n| Model | Parameters | Hugging Face Link |\n|:---|:---:|:---|\n| VoxCPM-0.5B | 0.5B | [openbmb/VoxCPM-0.5B](https://huggingface.co/openbmb/VoxCPM-0.5B) |\n\n## Support \n\n- Don't know how to update PyTorch?\n- Need help with ComfyUI?\n- Need technical support?\n\n### Or do you just have questions? Then join the [@TokenDiffusion Hub](https://t.me/TokenDiff_hub) group\n\n### AI news [TokenDiffusion](https://t.me/TokenDiff)"
+        "## 🎤 Achieving High-Quality Voice Clones\n\nTo achieve the best voice cloning results, providing an accurate `prompt_text` is **critical**. This text acts as a transcript that aligns the sound of the `prompt_audio` with the words being spoken, teaching the model the speaker's unique vocal characteristics.\n\n### How to Use `prompt_text` Effectively\n\n#### 1. **Provide a Verbatim Transcript**\nThe `prompt_text` must be a word-for-word transcript of the `prompt_audio`. Do not summarize or describe the audio.\n\n✅ **Correct:** `The quick brown fox jumps over the lazy dog.`\n\n❌ **Incorrect:** `A person saying a sentence about a fox.`\n\n#### 2. **Punctuation is Important**\nUse accurate punctuation to capture the speaker's intonation. The model learns how the speaker ends sentences, asks questions, or shows excitement.\n\n- **For a statement:** `This is a great example.`\n- **For a question:** `Is this a great example?`\n- **For excitement:** `This is a great example!`\n\n#### 3. **Match Audio and Text Length**\nThe audio clip should be long enough to capture the speaker's natural pacing and rhythm.\n\n👍 **Good:** A 5-15 second clip of continuous, clear speech.\n\n👌 **Okay:** A 3-5 second clip.\n\n⚠️ **Warning:** Very short clips (< 3 seconds) may result in a less stable or robotic-sounding clone.\n\n\n> **Note:**\n> For the best results, use a clean, high-quality `prompt_audio` with minimal background noise, reverb, or music. The model will try to clone the *entire* acoustic environment, not just the voice."
       ],
-      "color": "#233",
-      "bgcolor": "#355"
+      "color": "#432",
+      "bgcolor": "#653"
     },
     {
-      "id": 2,
-      "type": "LoadAudio",
+      "id": 1,
+      "type": "SaveAudio",
       "pos": [
-        -1843.4453125,
-        -1058.6048583984375
+        -991.1134643554688,
+        -1058.956787109375
       ],
       "size": [
-        350.4437255859375,
-        136
+        277.3636474609375,
+        112
       ],
       "flags": {},
-      "order": 1,
+      "order": 4,
       "mode": 0,
-      "inputs": [],
-      "outputs": [
+      "inputs": [
         {
-          "name": "AUDIO",
+          "name": "audio",
           "type": "AUDIO",
-          "links": [
-            8
-          ]
+          "link": 10
         }
       ],
+      "outputs": [],
       "properties": {
+        "Node name for S&R": "SaveAudio",
         "cnr_id": "comfy-core",
         "ver": "0.3.52",
-        "Node name for S&R": "LoadAudio",
         "ue_properties": {
           "widget_ue_connectable": {
-            "audio": true,
-            "audioUI": true,
-            "upload": true
+            "filename_prefix": true,
+            "audioUI": true
           },
           "version": "7.0.1"
         }
       },
       "widgets_values": [
-        "male_stewie.mp3",
-        null,
-        null
+        "audio/VoxCPM"
       ]
     },
     {
-      "id": 3,
+      "id": 5,
       "type": "VoxCPM_TTS",
       "pos": [
         -1438.1444091796875,
-        -1058.6048583984375
+        -1058.956787109375
       ],
       "size": [
         399.0082702636719,
@@ -128,66 +108,98 @@
           "name": "prompt_audio",
           "shape": 7,
           "type": "AUDIO",
-          "link": 8
+          "link": null
         }
       ],
       "outputs": [
         {
           "name": "Generated Audio",
           "type": "AUDIO",
           "links": [
-            1
+            10
           ]
         }
       ],
       "properties": {
-        "cnr_id": "ComfyUI-VoxCPM",
-        "ver": "e9845ede9bc6ad1febd3a9b5104a040315a12d50",
-        "Node name for S&R": "VoxCPM_TTS",
-        "aux_id": "wildminder/ComfyUI-VoxCPM"
+        "Node name for S&R": "VoxCPM_TTS"
       },
       "widgets_values": [
         "VoxCPM-0.5B",
         "You're baking with ingredients from the Realm of Madness?! I am… impressed. And deeply concerned.",
         2,
         20,
         true,
-        779450847354053,
+        779450847354054,
         "fixed",
         false,
-        "cpu",
-        "angry kid"
+        "cuda",
+        3,
+        5,
+        ""
       ],
       "color": "#232",
       "bgcolor": "#353"
+    },
+    {
+      "id": 2,
+      "type": "LoadAudio",
+      "pos": [
+        -1843.4453125,
+        -1058.956787109375
+      ],
+      "size": [
+        350.4437255859375,
+        136
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": []
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "LoadAudio",
+        "cnr_id": "comfy-core",
+        "ver": "0.3.52",
+        "ue_properties": {
+          "widget_ue_connectable": {
+            "audio": true,
+            "audioUI": true,
+            "upload": true
+          },
+          "version": "7.0.1"
+        }
+      },
+      "widgets_values": [
+        "male_stewie.mp3",
+        null,
+        null
+      ]
     }
   ],
   "links": [
     [
-      1,
-      3,
+      10,
+      5,
       0,
       1,
       0,
       "AUDIO"
-    ],
-    [
-      8,
-      2,
-      0,
-      3,
-      0,
-      "AUDIO"
     ]
   ],
   "groups": [],
   "config": {},
   "extra": {
     "ds": {
-      "scale": 1.2100000000000006,
+      "scale": 1.1,
       "offset": [
         1943.4453125,
-        1158.6048583984375
+        1158.956787109375
       ]
     },
     "frontendVersion": "1.26.11",
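For anyone rebuilding this graph by hand, the rewired workflow can be sanity-checked with a short Python snippet. This is a minimal sketch, not part of the commit; it assumes only the file path shown above and ComfyUI's serialized link layout of [link_id, from_node, from_slot, to_node, to_slot, type], which is what the single remaining link [10, 5, 0, 1, 0, "AUDIO"] (VoxCPM_TTS output 0 into SaveAudio input 0) encodes.

import json

# Load the updated example workflow shipped with the node pack.
with open("example_workflows/VoxCPM_example.json", "r", encoding="utf-8") as f:
    workflow = json.load(f)

# After this commit the graph holds two MarkdownNote nodes (ids 4 and 6),
# SaveAudio (1), VoxCPM_TTS (5), and a LoadAudio node (2) left unconnected.
for node in workflow["nodes"]:
    print(node["id"], node["type"])

# Each link is [link_id, from_node, from_slot, to_node, to_slot, type];
# expect a single AUDIO link from VoxCPM_TTS into SaveAudio.
for link in workflow["links"]:
    print(link)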

requirements.txt

Lines changed: 1 addition & 2 deletions
@@ -13,5 +13,4 @@ sortedcontainers
 soundfile
 diffusers
 tqdm
-bitsandbytes
-torchcodec
+bitsandbytes

src/voxcpm/core.py

Lines changed: 23 additions & 16 deletions
@@ -1,6 +1,5 @@
 import torch
 import os
-import tempfile
 import logging
 from huggingface_hub import snapshot_download
 from .model.voxcpm import VoxCPMModel
@@ -60,18 +59,21 @@ def from_pretrained(cls,
 
         return cls(voxcpm_model_path=local_path)
 
-    def generate(self,
-                 text : str,
-                 prompt_wav_path : str = None,
-                 prompt_text : str = None,
-                 cfg_value : float = 2.0,
-                 inference_timesteps : int = 10,
-                 max_length : int = 4096,
-                 normalize : bool = True,
-                 retry_badcase : bool = True,
-                 retry_badcase_max_times : int = 3,
-                 retry_badcase_ratio_threshold : float = 6.0,
-                 ):
+    def generate(
+        self,
+        text : str,
+        prompt_wav_path : str = None,
+        prompt_waveform: torch.Tensor = None,
+        prompt_sample_rate: int = None,
+        prompt_text : str = None,
+        cfg_value : float = 2.0,
+        inference_timesteps : int = 10,
+        max_length : int = 4096,
+        normalize : bool = True,
+        retry_badcase : bool = True,
+        retry_badcase_max_times : int = 3,
+        retry_badcase_ratio_threshold : float = 6.0,
+    ):
         """Synthesize speech for the given text and return a single waveform.
 
         This method optionally builds and reuses a prompt cache. If an external
@@ -100,13 +102,18 @@ def generate(self,
         texts = [t.strip() for t in texts if t.strip()]
         final_wav = []
 
-        if prompt_wav_path is not None and prompt_text is not None:
+        # Check for either waveform or path for cloning
+        is_cloning = prompt_waveform is not None or prompt_wav_path is not None
+        if is_cloning and prompt_text:
             fixed_prompt_cache = self.tts_model.build_prompt_cache(
+                prompt_text=prompt_text,
                 prompt_wav_path=prompt_wav_path,
-                prompt_text=prompt_text
+                prompt_waveform=prompt_waveform,
+                prompt_sample_rate=prompt_sample_rate
             )
         else:
-            fixed_prompt_cache = None # will be built from the first inference
+            # will be built from the first inference
+            fixed_prompt_cache = None
 
         for i, sub_text in enumerate(texts):
             if sub_text.strip() == "":
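Taken together with the dropped tempfile import, the new keyword arguments let a caller hand decoded audio straight to the model instead of writing a temporary file first, which is presumably how the ComfyUI node now feeds its AUDIO tensors in. A minimal usage sketch under stated assumptions: that the class defined in core.py is exported as VoxCPM, that from_pretrained accepts the Hugging Face repo id mentioned in the workflow note, and that torchaudio is available to decode the reference clip.

import torchaudio  # assumed available for decoding the reference clip
from voxcpm.core import VoxCPM  # assumption: class name and import path are not shown in this diff

# Downloads the checkpoint via huggingface_hub on first use (see snapshot_download above).
model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B")

# New in this commit: clone from an in-memory waveform instead of a file path.
waveform, sample_rate = torchaudio.load("male_stewie.mp3")  # reference clip named in the example workflow

wav = model.generate(
    text="You're baking with ingredients from the Realm of Madness?!",
    prompt_waveform=waveform,          # torch.Tensor, added by this commit
    prompt_sample_rate=sample_rate,    # added by this commit
    prompt_text="Verbatim transcript of the reference clip goes here.",
    cfg_value=2.0,
    inference_timesteps=10,
    retry_badcase=True,                # presumably the "retry params" from the commit message
    retry_badcase_max_times=3,
    retry_badcase_ratio_threshold=6.0,
)
# generate() returns a single waveform (see the docstring above); save or play it as needed.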
