diff --git a/CLAUDE.md b/CLAUDE.md index d94c228..a64c07d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -136,6 +136,9 @@ python tools/voiceover.py --scene-dir public/audio/scenes --json # Using Qwen3-TTS (self-hosted, free alternative to ElevenLabs) python tools/voiceover.py --provider qwen3 --tone warm --scene-dir public/audio/scenes --json +# Using MiniMax Cloud TTS (no GPU required, cloud API) +python tools/voiceover.py --provider minimax --minimax-voice English_Graceful_Lady --scene-dir public/audio/scenes --json + # Single file (legacy) python tools/voiceover.py --script SCRIPT.md --output out.mp3 ``` @@ -163,6 +166,17 @@ python tools/qwen3_tts.py --list-tones # neutral, warm, professional, excited Temperature controls expressiveness: `--temperature 1.2` (more expressive) or `--temperature 0.4` (more consistent). +### MiniMax Cloud TTS (Standalone) + +```bash +python tools/minimax_tts.py --text "Hello world" --output hello.mp3 +python tools/minimax_tts.py --text "Hello world" --voice English_Persuasive_Man --output hello.mp3 +python tools/minimax_tts.py --text "Hello world" --model turbo --output fast.mp3 +python tools/minimax_tts.py --list-voices # 12 voices: English + Chinese +``` + +Two models: `hd` (speech-2.8-hd, high quality) and `turbo` (speech-2.8-turbo, faster). No GPU required — runs entirely in the cloud via MiniMax API. + ### Cloud GPU Providers All cloud GPU tools support two providers via `--cloud runpod|modal`. RunPod is the default. Modal was added as a reliability fallback after RunPod outages, and offers faster cold starts. 
diff --git a/README.md b/README.md index 87fcef9..cff5d4f 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,10 @@ python tools/voiceover.py --script script.md --output voiceover.mp3 python tools/voiceover.py --provider qwen3 --speaker Ryan --scene-dir public/audio/scenes --json python tools/qwen3_tts.py --text "Hello world" --tone warm --output hello.mp3 +# Generate voiceover (MiniMax Cloud TTS — no GPU required) +python tools/voiceover.py --provider minimax --minimax-voice English_Persuasive_Man --scene-dir public/audio/scenes --json +python tools/minimax_tts.py --text "Hello world" --voice English_Graceful_Lady --output hello.mp3 + # Generate background music (ElevenLabs) python tools/music.py --prompt "Upbeat corporate" --duration 120 --output music.mp3 diff --git a/brands/default/voice.json b/brands/default/voice.json index a0a9396..27f71f9 100644 --- a/brands/default/voice.json +++ b/brands/default/voice.json @@ -14,5 +14,9 @@ "tone": "", "instruct": "", "clone": null + }, + "minimax": { + "voice": "English_Graceful_Lady", + "model": "hd" } } diff --git a/tests/test_minimax_tts.py b/tests/test_minimax_tts.py new file mode 100644 index 0000000..dc2df5d --- /dev/null +++ b/tests/test_minimax_tts.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 +""" +Unit tests for MiniMax TTS provider integration. 
+ +Tests cover: +- minimax_tts.py standalone module (generate_audio, voice/model validation) +- voiceover.py MiniMax provider integration (CLI args, dry-run, brand config) +- config.py get_minimax_api_key() +""" + +import json +import os +import sys +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +# Add tools/ to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "tools")) + + +class TestMiniMaxTTSConstants(unittest.TestCase): + """Test minimax_tts.py constants and configuration.""" + + def test_voices_dict_not_empty(self): + from minimax_tts import MINIMAX_VOICES + + self.assertGreater(len(MINIMAX_VOICES), 0) + + def test_english_voices_exist(self): + from minimax_tts import MINIMAX_VOICES + + english_voices = [v for v, lang in MINIMAX_VOICES.items() if lang == "English"] + self.assertGreaterEqual(len(english_voices), 5) + + def test_chinese_voices_exist(self): + from minimax_tts import MINIMAX_VOICES + + chinese_voices = [v for v, lang in MINIMAX_VOICES.items() if lang == "Chinese"] + self.assertGreaterEqual(len(chinese_voices), 7) + + def test_default_voice_is_valid(self): + from minimax_tts import DEFAULT_VOICE, MINIMAX_VOICES + + self.assertIn(DEFAULT_VOICE, MINIMAX_VOICES) + + def test_models_dict(self): + from minimax_tts import MINIMAX_TTS_MODELS + + self.assertIn("hd", MINIMAX_TTS_MODELS) + self.assertIn("turbo", MINIMAX_TTS_MODELS) + self.assertEqual(MINIMAX_TTS_MODELS["hd"], "speech-2.8-hd") + self.assertEqual(MINIMAX_TTS_MODELS["turbo"], "speech-2.8-turbo") + + def test_api_url(self): + from minimax_tts import MINIMAX_TTS_API_URL + + self.assertEqual(MINIMAX_TTS_API_URL, "https://api.minimax.io/v1/t2a_v2") + + def test_default_model(self): + from minimax_tts import DEFAULT_MODEL + + self.assertEqual(DEFAULT_MODEL, "hd") + + +class TestMiniMaxTTSGenerateAudio(unittest.TestCase): + """Test minimax_tts.generate_audio() function.""" + + @patch("minimax_tts.requests.post") + 
@patch.dict(os.environ, {"MINIMAX_API_KEY": "test_key_123"}) + def test_successful_generation(self, mock_post): + from minimax_tts import generate_audio + + # Create mock response with hex-encoded MP3 data + fake_audio_hex = b"fake_audio_bytes".hex() + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "base_resp": {"status_code": 0, "status_msg": "success"}, + "data": {"audio": fake_audio_hex}, + } + mock_post.return_value = mock_response + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test_output.mp3") + result = generate_audio( + text="Hello world", + output_path=output_path, + voice="English_Graceful_Lady", + model="hd", + verbose=False, + ) + + self.assertTrue(result["success"]) + self.assertEqual(result["output"], output_path) + self.assertEqual(result["script_chars"], 11) + self.assertTrue(Path(output_path).exists()) + + @patch("minimax_tts.requests.post") + @patch.dict(os.environ, {"MINIMAX_API_KEY": "test_key_123"}) + def test_api_error_response(self, mock_post): + from minimax_tts import generate_audio + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "base_resp": {"status_code": 1001, "status_msg": "Invalid API key"}, + "data": {}, + } + mock_post.return_value = mock_response + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test_output.mp3") + result = generate_audio( + text="Hello", + output_path=output_path, + verbose=False, + ) + + self.assertFalse(result["success"]) + self.assertIn("1001", result["error"]) + + @patch("minimax_tts.requests.post") + @patch.dict(os.environ, {"MINIMAX_API_KEY": "test_key_123"}) + def test_http_error(self, mock_post): + from minimax_tts import generate_audio + + mock_response = MagicMock() + mock_response.status_code = 500 + mock_response.text = "Internal Server Error" + mock_post.return_value = mock_response + + with 
@patch("minimax_tts.requests.post")
@patch("dotenv.load_dotenv", return_value=False)
@patch.dict(os.environ, {}, clear=True)
def test_missing_api_key(self, _mock_load_dotenv, mock_post):
    """generate_audio() reports a missing key instead of calling the API.

    generate_audio() falls back to loading the workspace ``.env`` when the
    variable is absent, so ``load_dotenv`` is stubbed out: otherwise a
    developer's real ``.env`` would repopulate MINIMAX_API_KEY and this test
    would fail (and hit the network). ``requests.post`` is patched as a
    safety net and asserted unused.
    """
    from minimax_tts import generate_audio

    # patch.dict(..., clear=True) already guarantees the variable is unset.
    with tempfile.TemporaryDirectory() as tmpdir:
        output_path = os.path.join(tmpdir, "test_output.mp3")
        result = generate_audio(
            text="Hello",
            output_path=output_path,
            verbose=False,
        )

    self.assertFalse(result["success"])
    self.assertIn("MINIMAX_API_KEY", result["error"])
    mock_post.assert_not_called()
"test.mp3") + generate_audio( + text="Test text", + output_path=output_path, + voice="English_Persuasive_Man", + model="turbo", + speed=1.2, + volume=0.8, + pitch=-2, + verbose=False, + ) + + # Verify the API was called with correct payload + call_args = mock_post.call_args + payload = call_args[1]["json"] if "json" in call_args[1] else call_args[0][1] + + self.assertEqual(payload["model"], "speech-2.8-turbo") + self.assertEqual(payload["text"], "Test text") + self.assertEqual(payload["voice_setting"]["voice_id"], "English_Persuasive_Man") + self.assertEqual(payload["voice_setting"]["speed"], 1.2) + self.assertEqual(payload["voice_setting"]["vol"], 0.8) + self.assertEqual(payload["voice_setting"]["pitch"], -2) + self.assertEqual(payload["audio_setting"]["format"], "mp3") + + @patch("minimax_tts.requests.post") + @patch.dict(os.environ, {"MINIMAX_API_KEY": "test_key_123"}) + def test_authorization_header(self, mock_post): + from minimax_tts import generate_audio + + fake_audio_hex = b"audio".hex() + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "base_resp": {"status_code": 0}, + "data": {"audio": fake_audio_hex}, + } + mock_post.return_value = mock_response + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test.mp3") + generate_audio(text="Hi", output_path=output_path, verbose=False) + + call_args = mock_post.call_args + headers = call_args[1]["headers"] + self.assertEqual(headers["Authorization"], "Bearer test_key_123") + self.assertEqual(headers["Content-Type"], "application/json") + + @patch("minimax_tts.requests.post") + @patch.dict(os.environ, {"MINIMAX_API_KEY": "test_key_123"}) + def test_timeout_handling(self, mock_post): + import requests as req + from minimax_tts import generate_audio + + mock_post.side_effect = req.exceptions.Timeout("Connection timed out") + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test.mp3") + result 
class TestConfigMiniMaxAPIKey(unittest.TestCase):
    """Test config.py get_minimax_api_key()."""

    @patch.dict(os.environ, {"MINIMAX_API_KEY": "my_key_abc"})
    def test_returns_key_from_env(self):
        """A key present in the environment is returned as-is."""
        from config import get_minimax_api_key

        self.assertEqual(get_minimax_api_key(), "my_key_abc")

    @patch("dotenv.load_dotenv", return_value=False)
    @patch.dict(os.environ, {}, clear=True)
    def test_returns_none_when_missing(self, _mock_load_dotenv):
        """With no key in the environment the helper returns None.

        get_minimax_api_key() calls load_dotenv() before reading the
        environment, so load_dotenv is stubbed: a real ``.env`` on the
        machine running the tests must not leak a key into this check.
        """
        from config import get_minimax_api_key

        self.assertIsNone(get_minimax_api_key())
"minimax", + "--script", "test.txt", + "--output", "out.mp3", + ]): + args = parse_args() + self.assertEqual(args.minimax_voice, "English_Graceful_Lady") + + def test_minimax_voice_custom(self): + from voiceover import parse_args + + with patch("sys.argv", [ + "voiceover.py", + "--provider", "minimax", + "--minimax-voice", "English_Persuasive_Man", + "--script", "test.txt", + "--output", "out.mp3", + ]): + args = parse_args() + self.assertEqual(args.minimax_voice, "English_Persuasive_Man") + + def test_minimax_model_choices(self): + from voiceover import parse_args + + for model_choice in ["hd", "turbo"]: + with patch("sys.argv", [ + "voiceover.py", + "--provider", "minimax", + "--minimax-model", model_choice, + "--script", "test.txt", + "--output", "out.mp3", + ]): + args = parse_args() + self.assertEqual(args.minimax_model, model_choice) + + def test_minimax_volume_and_pitch(self): + from voiceover import parse_args + + with patch("sys.argv", [ + "voiceover.py", + "--provider", "minimax", + "--volume", "2.0", + "--pitch", "3", + "--script", "test.txt", + "--output", "out.mp3", + ]): + args = parse_args() + self.assertEqual(args.volume, 2.0) + self.assertEqual(args.pitch, 3) + + +class TestVoiceoverMiniMaxDryRun(unittest.TestCase): + """Test voiceover.py dry-run output for MiniMax provider.""" + + @patch.dict(os.environ, {"MINIMAX_API_KEY": "test_key"}) + def test_single_file_dry_run(self): + from voiceover import parse_args + + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f: + f.write("Test script content") + script_path = f.name + + try: + import io + from contextlib import redirect_stdout + + with patch("sys.argv", [ + "voiceover.py", + "--provider", "minimax", + "--script", script_path, + "--output", "out.mp3", + "--dry-run", + "--json", + ]): + captured = io.StringIO() + with redirect_stdout(captured): + from voiceover import main + from dotenv import load_dotenv + load_dotenv() + main() + + output = captured.getvalue() + result 
class TestVoiceoverMiniMaxBrand(unittest.TestCase):
    """Test voiceover.py brand config resolution for MiniMax."""

    def test_brand_voice_config_with_minimax(self):
        """Verify that voice.json with a minimax section can be loaded."""
        from config import load_brand_voice_config

        config = load_brand_voice_config("default")
        if not config:
            # Report the missing config explicitly instead of passing
            # without having checked anything.
            self.skipTest("default brand voice config could not be loaded")

        self.assertIn("minimax", config)
        minimax_cfg = config["minimax"]
        self.assertIn("voice", minimax_cfg)
        self.assertIn("model", minimax_cfg)
capture_output=True, + text=True, + cwd=str(Path(__file__).parent.parent / "tools"), + ) + + self.assertEqual(result.returncode, 0) + self.assertIn("English_Graceful_Lady", result.stdout) + self.assertIn("English_Persuasive_Man", result.stdout) + self.assertIn("Deep_Voice_Man", result.stdout) + self.assertIn("speech-2.8-hd", result.stdout) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_minimax_tts_integration.py b/tests/test_minimax_tts_integration.py new file mode 100644 index 0000000..1633830 --- /dev/null +++ b/tests/test_minimax_tts_integration.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +""" +Integration tests for MiniMax TTS provider. + +These tests require MINIMAX_API_KEY to be set and make real API calls. +Skip with: python -m pytest tests/test_minimax_tts_integration.py -k "not integration" +Or run only integration tests: MINIMAX_API_KEY=xxx python -m pytest tests/test_minimax_tts_integration.py +""" + +import json +import os +import sys +import tempfile +import unittest +from pathlib import Path + +# Add tools/ to path +sys.path.insert(0, str(Path(__file__).parent.parent / "tools")) + +MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY") +SKIP_REASON = "MINIMAX_API_KEY not set — skipping integration tests" + + +@unittest.skipUnless(MINIMAX_API_KEY, SKIP_REASON) +class TestMiniMaxTTSIntegration(unittest.TestCase): + """Integration tests that call the real MiniMax TTS API.""" + + def test_generate_audio_hd(self): + """Test generating audio with speech-2.8-hd model.""" + from minimax_tts import generate_audio + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test_hd.mp3") + result = generate_audio( + text="Hello, this is a test of MiniMax text to speech.", + output_path=output_path, + voice="English_Graceful_Lady", + model="hd", + verbose=False, + ) + + self.assertTrue(result["success"], f"Failed: {result.get('error')}") + self.assertTrue(Path(output_path).exists()) + 
self.assertGreater(Path(output_path).stat().st_size, 1000) + + def test_generate_audio_turbo(self): + """Test generating audio with speech-2.8-turbo model.""" + from minimax_tts import generate_audio + + with tempfile.TemporaryDirectory() as tmpdir: + output_path = os.path.join(tmpdir, "test_turbo.mp3") + result = generate_audio( + text="Quick turbo test of MiniMax.", + output_path=output_path, + voice="English_Persuasive_Man", + model="turbo", + verbose=False, + ) + + self.assertTrue(result["success"], f"Failed: {result.get('error')}") + self.assertTrue(Path(output_path).exists()) + + def test_voiceover_minimax_single_file(self): + """Test voiceover.py with --provider minimax in single-file mode.""" + import subprocess + + with tempfile.TemporaryDirectory() as tmpdir: + script_path = os.path.join(tmpdir, "script.txt") + output_path = os.path.join(tmpdir, "voiceover.mp3") + + Path(script_path).write_text("Integration test voiceover with MiniMax.") + + result = subprocess.run( + [ + sys.executable, "tools/voiceover.py", + "--provider", "minimax", + "--script", script_path, + "--output", output_path, + "--json", + ], + capture_output=True, + text=True, + cwd=str(Path(__file__).parent.parent), + env={**os.environ, "MINIMAX_API_KEY": MINIMAX_API_KEY}, + ) + + self.assertEqual(result.returncode, 0, f"stderr: {result.stderr}") + output = json.loads(result.stdout) + self.assertTrue(output.get("success")) + self.assertEqual(output["provider"], "minimax") + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/config.py b/tools/config.py index a8319e7..5dc2b4d 100644 --- a/tools/config.py +++ b/tools/config.py @@ -57,6 +57,13 @@ def get_default_output_dir(project_path: str | None = None) -> Path: return find_workspace_root() / "public" / "audio" +def get_minimax_api_key() -> str | None: + """Get MiniMax API key from environment.""" + from dotenv import load_dotenv + load_dotenv() + return os.getenv("MINIMAX_API_KEY") + + def get_runpod_api_key() -> str | None: 
"""Get RunPod API key from environment.""" from dotenv import load_dotenv diff --git a/tools/minimax_tts.py b/tools/minimax_tts.py new file mode 100644 index 0000000..bacd1ec --- /dev/null +++ b/tools/minimax_tts.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 +""" +Generate speech using MiniMax Cloud TTS API. + +MiniMax offers high-quality text-to-speech with multiple voice presets, supporting +both English and Chinese voices. No GPU required — runs entirely in the cloud. + +Usage: + # Basic usage + python tools/minimax_tts.py --text "Hello world" --output hello.mp3 + + # Choose a voice + python tools/minimax_tts.py --text "Hello world" --voice English_Graceful_Lady --output hello.mp3 + + # Choose model (hd or turbo) + python tools/minimax_tts.py --text "Hello world" --model turbo --output fast.mp3 + + # List available voices + python tools/minimax_tts.py --list-voices + + # JSON output for machine parsing + python tools/minimax_tts.py --text "Hello world" --output hello.mp3 --json + +Setup: + 1. Get an API key from https://www.minimaxi.com/ + 2. 
def generate_audio(
    text: str,
    output_path: str,
    voice: str = DEFAULT_VOICE,
    model: str = DEFAULT_MODEL,
    speed: float = 1.0,
    volume: float = 1.0,
    pitch: int = 0,
    timeout: int = 60,
    verbose: bool = True,
) -> dict:
    """Generate audio using MiniMax Cloud TTS API.

    This is the main entry point, importable by voiceover.py. Failures are
    reported through the returned dict rather than raised.

    Args:
        text: Text to synthesize.
        output_path: Path to save the output audio file (.mp3).
        voice: Voice ID (see MINIMAX_VOICES).
        model: Model shorthand — "hd" or "turbo". Any other value is sent
            to the API unchanged as the model ID.
        speed: Speech speed multiplier (0.5-2.0, default 1.0).
        volume: Volume level (0.1-10.0, default 1.0).
        pitch: Pitch shift in semitones (-12 to 12, default 0).
        timeout: Request timeout in seconds.
        verbose: Print progress messages to stderr.

    Returns:
        On success: ``{"success": True, "output", "script_chars"}`` plus
        ``duration_seconds`` and ``duration_frames_30fps`` when
        get_audio_duration() could measure the file.
        On failure: ``{"success": False, "error": <message>}``.
    """
    start_time = time.time()

    # Nothing to synthesize: fail locally instead of spending an API call.
    if not text or not text.strip():
        return {"success": False, "error": "No text provided to synthesize"}

    # Resolve API key: environment first, then the workspace .env file.
    api_key = os.getenv("MINIMAX_API_KEY")
    if not api_key:
        # Imported lazily so the project config / dotenv are only needed
        # when the key is not already exported.
        from dotenv import load_dotenv

        from config import find_workspace_root

        load_dotenv(find_workspace_root() / ".env")
        api_key = os.getenv("MINIMAX_API_KEY")

    if not api_key:
        return {
            "success": False,
            "error": (
                "MINIMAX_API_KEY not set. Get one at https://www.minimaxi.com/ "
                "and add to .env:\n echo \"MINIMAX_API_KEY=your_key\" >> .env"
            ),
        }

    # Resolve model shorthand; unknown values pass through as a raw model ID.
    model_id = MINIMAX_TTS_MODELS.get(model, model)

    if verbose:
        print(f"Generating speech with MiniMax TTS ({model_id})...", file=sys.stderr)
        print(f" Voice: {voice}", file=sys.stderr)
        print(f" Text: {len(text)} chars", file=sys.stderr)

    payload = {
        "model": model_id,
        "text": text,
        "voice_setting": {
            "voice_id": voice,
            "speed": speed,
            "vol": volume,
            "pitch": pitch,
        },
        "audio_setting": {
            "format": "mp3",
        },
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    try:
        response = requests.post(
            MINIMAX_TTS_API_URL,
            json=payload,
            headers=headers,
            timeout=timeout,
        )
    except requests.exceptions.Timeout:
        return {"success": False, "error": f"Request timed out after {timeout}s"}
    except requests.exceptions.RequestException as e:
        return {"success": False, "error": f"Request failed: {e}"}

    if response.status_code != 200:
        return {
            "success": False,
            "error": f"API returned HTTP {response.status_code}: {response.text[:500]}",
        }

    # ValueError covers both json.JSONDecodeError and the simplejson-based
    # error that requests raises when simplejson is installed.
    try:
        result = response.json()
    except ValueError:
        return {"success": False, "error": "Invalid JSON response from API"}

    if not isinstance(result, dict):
        return {"success": False, "error": "Invalid JSON response from API"}

    # Check for API-level errors. `or {}` tolerates an explicit JSON null.
    base_resp = result.get("base_resp") or {}
    api_status = base_resp.get("status_code", 0)
    if api_status != 0:
        return {
            "success": False,
            "error": f"API error {api_status}: {base_resp.get('status_msg', 'Unknown')}",
        }

    # Extract audio data (hex-encoded bytes).
    audio_hex = (result.get("data") or {}).get("audio")
    if not audio_hex:
        return {"success": False, "error": "No audio data in response"}

    try:
        audio_bytes = bytes.fromhex(audio_hex)
    except (TypeError, ValueError) as e:
        return {"success": False, "error": f"Failed to decode audio data: {e}"}

    # Keep the "errors are returned, not raised" contract for disk failures.
    destination = Path(output_path)
    try:
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(audio_bytes)
    except OSError as e:
        return {"success": False, "error": f"Failed to write {output_path}: {e}"}

    elapsed = time.time() - start_time
    duration = get_audio_duration(output_path)

    if verbose:
        size_kb = len(audio_bytes) // 1024
        print(f" Saved: {output_path} ({size_kb}KB)", file=sys.stderr)
        if duration:
            print(f" Duration: {duration:.1f}s", file=sys.stderr)
        print(f" Elapsed: {elapsed:.1f}s", file=sys.stderr)

    result_dict = {
        "success": True,
        "output": output_path,
        "script_chars": len(text),
    }
    if duration:
        result_dict["duration_seconds"] = round(duration, 2)
        result_dict["duration_frames_30fps"] = int(duration * 30)

    return result_dict
def main():
    """CLI entry point for standalone MiniMax TTS generation.

    Handles ``--list-voices``, validates ``--text``/``--output``, then calls
    generate_audio() and reports the result. With ``--json`` every outcome,
    including argument-validation failures, is written to stdout as a JSON
    object so callers can always parse it. Exits with status 1 on failure.
    """
    args = parse_args()
    verbose = not args.json

    # Handle --list-voices
    if args.list_voices:
        print("MiniMax TTS Voices:")
        print()
        print(f" {'Voice ID':<30} {'Language'}")
        print(f" {'-' * 30} {'-' * 10}")
        for voice_id, lang in MINIMAX_VOICES.items():
            print(f" {voice_id:<30} {lang}")
        print()
        print("Models:")
        # Model IDs come from MINIMAX_TTS_MODELS so the listing cannot drift
        # from what generate_audio() actually sends.
        print(f" hd — {MINIMAX_TTS_MODELS['hd']} (high quality, recommended)")
        print(f" turbo — {MINIMAX_TTS_MODELS['turbo']} (faster, good for drafts)")
        print()
        print("Usage: --voice English_Persuasive_Man --model hd")
        sys.exit(0)

    # Validate required arguments (first missing flag wins, as before).
    required = (("--text", args.text), ("--output", args.output))
    missing_flag = next((flag for flag, value in required if not value), None)
    if missing_flag:
        message = f"{missing_flag} is required"
        print(f"Error: {message}", file=sys.stderr)
        if args.json:
            print(json.dumps({"success": False, "error": message}, indent=2))
        sys.exit(1)

    # An unknown voice is only a warning: the API may accept IDs that are
    # not in the verified list.
    if args.voice not in MINIMAX_VOICES:
        print(
            f"Warning: '{args.voice}' is not a verified voice ID. "
            f"Use --list-voices to see options.",
            file=sys.stderr,
        )

    from dotenv import load_dotenv

    load_dotenv()

    result = generate_audio(
        text=args.text,
        output_path=args.output,
        voice=args.voice,
        model=args.model,
        speed=args.speed,
        volume=args.volume,
        pitch=args.pitch,
        timeout=args.timeout,
        verbose=verbose,
    )
    succeeded = bool(result.get("success"))

    if args.json:
        print(json.dumps(result, indent=2))
    elif not succeeded:
        print(f"Error: {result.get('error', 'Unknown error')}", file=sys.stderr)
    else:
        print(f"Generated: {result['output']}")
        duration = result.get("duration_seconds", 0)
        if duration:
            # Prefer the frame count computed from the unrounded duration so
            # the text output agrees with the JSON output.
            frames = result.get("duration_frames_30fps", int(duration * 30))
            print(f" Duration: {duration:.1f}s ({frames} frames @ 30fps)")

    if not succeeded:
        sys.exit(1)
def generate_single_audio_minimax(
    script: str,
    output_path: Path,
    voice: str = "English_Graceful_Lady",
    model: str = "hd",
    speed: float = 1.0,
    volume: float = 1.0,
    pitch: int = 0,
) -> dict:
    """Generate a single audio file from script text using MiniMax TTS.

    Thin wrapper around minimax_tts.generate_audio() that creates the
    destination directory and silences progress output.

    Args:
        script: Text to synthesize.
        output_path: Destination .mp3 path (a ``Path``; plain strings are
            accepted too).
        voice: MiniMax voice ID.
        model: Model shorthand, "hd" or "turbo".
        speed: Speech speed multiplier.
        volume: Volume level.
        pitch: Pitch shift in semitones.

    Returns:
        The result dict produced by minimax_tts.generate_audio().
    """
    # Imported lazily so voiceover.py does not need minimax_tts (and its
    # dependencies) unless this provider is actually selected.
    from minimax_tts import generate_audio

    # Normalise so callers passing a str path do not fail on `.parent`.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    return generate_audio(
        text=script,
        output_path=str(destination),
        voice=voice,
        model=model,
        speed=speed,
        volume=volume,
        pitch=pitch,
        verbose=False,
    )
found.\n" + "\n" + "You have 3 options:\n" + "\n" + " 1. Add a MiniMax key:\n" + " echo \"MINIMAX_API_KEY=your_key\" >> .env\n" + " (Get one at https://www.minimaxi.com/)\n" + "\n" + " 2. Use Qwen3-TTS instead (free, self-hosted):\n" + " python3 tools/voiceover.py --provider qwen3 --speaker Ryan --scene-dir public/audio/scenes --json\n" + "\n" + " 3. Skip voiceover entirely:\n" + " Videos render fine without audio. Add voiceover later when ready.", + file=sys.stderr, + ) + sys.exit(1) + # Per-scene mode if args.scene_dir: scene_dir = Path(args.scene_dir) @@ -619,7 +714,8 @@ def main(): if not args.json: txt_count = len(list(scene_dir.glob("*.txt"))) - provider_label = "Qwen3-TTS" if provider == "qwen3" else "ElevenLabs" + provider_labels = {"qwen3": "Qwen3-TTS", "minimax": "MiniMax TTS", "elevenlabs": "ElevenLabs"} + provider_label = provider_labels.get(provider, provider) print(f"Processing {txt_count} scene scripts in {scene_dir} ({provider_label})...", file=sys.stderr) if args.dry_run: @@ -645,6 +741,10 @@ def main(): temperature=args.temperature, top_p=args.top_p, cloud=args.cloud, + minimax_voice=args.minimax_voice, + minimax_model=args.minimax_model, + volume=args.volume, + pitch=args.pitch, ) result = { "dry_run": True, @@ -663,6 +763,14 @@ def main(): "style": args.style, "speed": args.speed, } + elif provider == "minimax": + result["voice"] = args.minimax_voice + result["model"] = args.minimax_model + result["settings"] = { + "speed": args.speed, + "volume": args.volume, + "pitch": args.pitch, + } else: result["speaker"] = args.speaker result["language"] = args.language @@ -699,6 +807,10 @@ def main(): temperature=args.temperature, top_p=args.top_p, cloud=args.cloud, + minimax_voice=args.minimax_voice, + minimax_model=args.minimax_model, + volume=args.volume, + pitch=args.pitch, ) # Build final result @@ -761,6 +873,14 @@ def main(): "style": args.style, "speed": args.speed, } + elif provider == "minimax": + result["voice"] = args.minimax_voice + 
result["model"] = args.minimax_model + result["settings"] = { + "speed": args.speed, + "volume": args.volume, + "pitch": args.pitch, + } else: result["speaker"] = args.speaker result["language"] = args.language @@ -777,6 +897,9 @@ def main(): if provider == "elevenlabs": print(f" Voice ID: {voice_id}") print(f" Model: {args.model}") + elif provider == "minimax": + print(f" Voice: {args.minimax_voice}") + print(f" Model: {args.minimax_model}") else: print(f" Speaker: {args.speaker}") print(f" Language: {args.language}") @@ -786,7 +909,8 @@ def main(): # Generate voiceover if not args.json: - provider_label = "Qwen3-TTS" if provider == "qwen3" else "ElevenLabs" + provider_labels = {"qwen3": "Qwen3-TTS", "minimax": "MiniMax TTS", "elevenlabs": "ElevenLabs"} + provider_label = provider_labels.get(provider, provider) print(f"Generating voiceover ({len(script)} chars, {provider_label})...", file=sys.stderr) if provider == "qwen3": @@ -802,6 +926,16 @@ def main(): top_p=args.top_p, cloud=args.cloud, ) + elif provider == "minimax": + result = generate_single_audio_minimax( + script=script, + output_path=output_path, + voice=args.minimax_voice, + model=args.minimax_model, + speed=args.speed, + volume=args.volume, + pitch=args.pitch, + ) else: result = generate_single_audio( client=client,