mofa-org · ymote · Apr 24, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
diff --git a/mofa-fm/Cargo.lock b/mofa-fm/Cargo.lock
diff --git a/mofa-fm/Cargo.toml b/mofa-fm/Cargo.toml
@@ -13,4 +13,3 @@ path = "src/main.rs"
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 reqwest = { version = "0.12", features = ["blocking", "rustls-tls", "json", "multipart"], default-features = false }
-base64 = "0.22"
diff --git a/mofa-fm/SKILL.md b/mofa-fm/SKILL.md
@@ -1,25 +1,40 @@
 ---
 name: mofa-fm
-description: TTS and voice cloning. Triggers: voice, TTS, text to speech, 语音, 播报, read aloud.
+description: TTS and voice cloning. Triggers: voice, TTS, text to speech, clone voice, 语音, 克隆声音, 播报, read aloud.
 version: 0.4.3
 author: hagency
-always: false
+always: true
 ---
 
-# MoFA FM — Text-to-Speech
+# MoFA FM — Text-to-Speech and Voice Cloning
 
-## How to use
+## Workflow
 
-1. Call `fm_tts` directly with the full text. It runs in background and delivers the audio automatically.
-2. Do NOT use spawn, shell scripts, or manual text splitting.
-3. Call `fm_voice_list` before TTS to check available voices (preset + custom).
+The skill exposes two distinct capabilities. Pick the right one for the user's request:
+
+- **Synthesize with an existing voice** → call `fm_tts` directly. Works for preset voices and any custom voice already saved via `fm_voice_save`.
+- **Clone a new voice from an audio clip** (e.g. "克隆这个语音并命名为 X", "use this clip as a new voice") → call `fm_voice_save` FIRST to register the voice with the TTS server, then call `fm_tts` with that name.
+
+Never call `fm_tts` with a brand-new voice name and expect cloning to happen automatically — it will fail with "voice 'X' is not registered on ominix-api". The fixed sequence is:
+
+```
+user uploads wav  →  fm_voice_save(name, audio_path[, transcript])  →  fm_tts(voice=name, text=...)
+```
+
+`fm_voice_save` runs a full VITS fine-tune on the server and may take several minutes per voice. The call blocks until training completes, so a long wait is expected; do not retry just because the call appears to hang.
+
+## Rules
+
+1. Call `fm_voice_list` before TTS to check available voices (preset + saved custom).
+2. Call `fm_tts` directly with the full text. It runs in background and delivers the audio automatically.
+3. Do NOT use spawn, shell scripts, or manual text splitting.
 
 ## Voices
 
 Preset: vivian (default), serena, ryan, aiden, eric, dylan, uncle_fu, ono_anna, sohee
 
-Custom voices are saved via `fm_voice_save` and used by name in `fm_tts`.
-`fm_voice_save` accepts a short reference clip in WAV directly, or MP3/M4A/OGG/FLAC which will be converted to WAV before saving.
+Custom voices are registered via `fm_voice_save` and used by name in `fm_tts`.
+`fm_voice_save` accepts a short reference clip (3-10 seconds of clear speech) as WAV directly; MP3/M4A/OGG/FLAC clips are converted to WAV before training.
 
 ## Style prompt
 

diff --git a/mofa-fm/manifest.json b/mofa-fm/manifest.json
@@ -56,7 +56,7 @@
       "name": "fm_tts",
       "spawn_only": true,
       "spawn_only_message": "SUCCESS: Audio generation is now running in background. The audio file will be automatically delivered to the user when ready. No further action needed for this request.",
-      "description": "Synthesize speech from text. Call this tool directly with the full text — it runs in background automatically and the audio file is delivered to the user when ready. Do NOT use shell scripts or manual splitting. Supports long text, preset and custom voices.",
+      "description": "Synthesize speech from text using a preset voice or a voice that has already been registered via fm_voice_save. To clone a new voice from a sample clip (克隆), call fm_voice_save FIRST to register the speaker — fm_tts itself does not create or register voices. Call this tool directly with the full text — it runs in background and the audio file is delivered to the user when ready. Do NOT use shell scripts or manual splitting. Supports long text, preset and saved custom voices.",
       "input_schema": {
         "type": "object",
         "properties": {
@@ -92,17 +92,25 @@
     },
     {
       "name": "fm_voice_save",
-      "description": "Save an audio file as a named custom voice. ALWAYS call fm_voice_list first to check if the voice already exists — do NOT re-save existing voices. Only save new voices from user-provided audio clips (3-10 seconds of clear speech).",
+      "description": "Clone (克隆) a new voice from a reference audio clip and register it with the TTS server so it can be used by fm_tts. MUST be called BEFORE fm_tts whenever the user asks to clone a voice or use a non-preset voice. Audio should be 3-10 seconds of clear speech from the target speaker (WAV preferred; MP3/M4A/OGG/FLAC are auto-converted). Once saved, pass the same name to fm_tts to synthesize. ALWAYS call fm_voice_list first to check if the voice already exists — do NOT re-save existing voices. This tool runs a full VITS fine-tune on the server and may take several minutes; the call blocks until training completes, so do not retry while it is still running.",
       "input_schema": {
         "type": "object",
         "properties": {
           "name": {
             "type": "string",
-            "description": "Name for the voice (alphanumeric and underscores only, e.g. 'my_voice', 'boss')"
+            "description": "Name for the voice (alphanumeric and underscores only, e.g. 'my_voice', 'boss'). Pass this same name to fm_tts after saving."
           },
           "audio_path": {
             "type": "string",
             "description": "Absolute path to the audio file to use as voice reference. Supports WAV directly and auto-converts MP3/M4A/OGG/FLAC to WAV before saving."
+          },
+          "transcript": {
+            "type": "string",
+            "description": "Transcript of what is said in the reference audio. Required by the TTS server for high-quality cloning. Provide the exact text spoken in the clip, in the same language as the clip. If unknown, pass a short placeholder phrase in the target language (training will still complete but quality may degrade)."
+          },
+          "language": {
+            "type": "string",
+            "description": "Language hint for the reference audio: 'zh' (default), 'en', 'ja', 'ko'."
           }
         },
         "required": [