diff --git a/app_rvc.py b/app_rvc.py index 47718cd..60a9333 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -1,3 +1,5 @@ +from dotenv import load_dotenv +load_dotenv() import gradio as gr from soni_translate.logging_setup import ( logger, @@ -40,6 +42,7 @@ BARK_VOICES_LIST, VITS_VOICES_LIST, OPENAI_TTS_MODELS, + camb_tts_voices_list, ) from soni_translate.utils import ( remove_files, @@ -120,6 +123,7 @@ def __init__(self, piper_enabled, xtts_enabled): self.list_bark = list(BARK_VOICES_LIST.keys()) self.list_vits = list(VITS_VOICES_LIST.keys()) self.list_openai_tts = OPENAI_TTS_MODELS + self.list_camb_tts = camb_tts_voices_list() self.piper_enabled = piper_enabled self.list_vits_onnx = ( piper_tts_voices_list() if self.piper_enabled else [] @@ -135,6 +139,7 @@ def tts_list(self): + self.list_bark + self.list_vits + self.list_openai_tts + + self.list_camb_tts + self.list_vits_onnx ) return list_tts @@ -1133,9 +1138,11 @@ def multilingual_media_conversion( burn_subtitles_to_video ], {}): # Merge new audio + video + # Use AAC encoding (-c:a aac) instead of -c:a copy because the + # mix audio is MP3, and MP3-in-MP4 won't play on macOS/QuickTime. 
# CAMB AI TTS voices - fetched dynamically from the API
# Format: ">voice_id voice_name [gender] CAMB-TTS"
# The language is passed separately at TTS time based on the target language,
# so voices are not language-specific in the dropdown.

CAMB_GENDER_MAP = {1: "Male", 2: "Female"}


def _format_camb_voice(voice):
    """Return the dropdown label for one CAMB voice record.

    Returns None when the record cannot be used: it is not a mapping, or
    its ``id`` is missing or not an integer. The TTS step recovers the
    voice id with ``int(label.split()[0][1:])``, so a label built from a
    non-integer id would only fail later, in the middle of synthesis.
    """
    if not isinstance(voice, dict):
        return None

    try:
        voice_id = int(voice.get("id"))
    except (TypeError, ValueError):
        return None

    # ``or`` also covers an explicit null / empty name in the payload,
    # which would otherwise be rendered as the literal text "None".
    name = str(voice.get("voice_name") or "unknown").strip()
    gender = CAMB_GENDER_MAP.get(voice.get("gender"), "")
    label = f"{name} {gender}".strip()
    return f">{voice_id} {label} CAMB-TTS"


def camb_tts_voices_list():
    """Fetch the available voices from the CAMB AI API.

    The API key is read from the ``CAMB_API_KEY`` environment variable.
    This is a best-effort lookup: any failure is logged and results in an
    empty list, so the caller can simply concatenate the result with the
    other TTS voice lists.

    Returns:
        A sorted list of labels of the form
        ``">voice_id voice_name [gender] CAMB-TTS"``; an empty list when
        the key is not set, the request fails, or the payload is not a
        list of voice records.
    """
    import logging
    import os

    logger = logging.getLogger(__name__)
    api_key = os.getenv("CAMB_API_KEY", "")
    if not api_key:
        logger.info("CAMB_API_KEY not set, CAMB TTS voices disabled")
        return []

    try:
        # Imported only once the feature is known to be enabled, so the
        # disabled path has no dependency on the HTTP client.
        import requests

        response = requests.get(
            "https://client.camb.ai/apis/list-voices",
            headers={"x-api-key": api_key},
            timeout=30,
        )
        response.raise_for_status()
        voices = response.json()
    except Exception as error:
        # Deliberately broad: an unreachable or misbehaving API must only
        # disable CAMB voices, never propagate to the caller.
        logger.warning("Failed to fetch CAMB TTS voices: %s", error)
        return []

    if not isinstance(voices, list):
        logger.warning(
            "Failed to fetch CAMB TTS voices: unexpected payload type %s",
            type(voices).__name__,
        )
        return []

    formatted = sorted(
        label
        for label in map(_format_camb_voice, voices)
        if label is not None
    )

    skipped = len(voices) - len(formatted)
    if skipped:
        logger.warning(
            "Skipped %d CAMB TTS voice entries without a usable id", skipped
        )

    logger.info("Loaded %d CAMB TTS voices", len(formatted))
    return formatted
b/soni_translate/text_to_speech.py index 2626912..e5fceea 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -130,6 +130,13 @@ def edge_tts_voices_list(): voice_entry["Gender"] = line.split(": ")[1] voices.append(voice_entry) + # Fallback: parse table format (newer edge-tts versions) + if not voices: + for line in lines: + parts = line.split() + if len(parts) >= 2 and "-" in parts[0] and parts[1] in ("Male", "Female"): + voices.append({"Name": parts[0], "Gender": parts[1]}) + formatted_voices = [ f"{entry['Name']}-{entry['Gender']}" for entry in voices ] @@ -939,6 +946,112 @@ def segments_openai_tts( error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) +# ===================================== +# CAMB AI TTS +# ===================================== + + +def segments_camb_tts( + filtered_camb_tts_segments, TRANSLATE_AUDIO_TO +): + import requests + import os as _os + + api_key = _os.getenv("CAMB_API_KEY", "") + if not api_key: + raise TTS_OperationError( + "CAMB_API_KEY environment variable is required for CAMB AI TTS" + ) + + sampling_rate = 24000 + CAMB_API_BASE = "https://client.camb.ai/apis" + + for segment in tqdm(filtered_camb_tts_segments["segments"]): + speaker = segment["speaker"] # noqa + text = segment["text"].strip() + start = segment["start"] + tts_name = segment["tts_name"] + + # Parse voice_id from tts_name + # Format: ">voice_id name [gender] CAMB-TTS" + voice_id = int(tts_name.split()[0][1:]) # Remove '>' prefix + + # Map SoniTranslate language code to CAMB locale + CAMB_LANG_MAP = { + "ar": "ar-sa", "zh": "zh-cn", "zh-CN": "zh-cn", + "zh-TW": "zh-cn", "nl": "nl-nl", "en": "en-us", + "fr": "fr-fr", "de": "de-de", "hi": "hi-in", + "id": "id-id", "it": "it-it", "ja": "ja-jp", + "ko": "ko-kr", "pl": "pl-pl", "pt": "pt-br", + "ru": "ru-ru", "es": "es-es", "tr": "tr-tr", + "ta": "ta-in", "te": "te-in", "bn": "bn-in", + } + camb_lang = CAMB_LANG_MAP.get( + TRANSLATE_AUDIO_TO, "en-us" + ) + + # make the 
tts audio + filename = f"audio/{start}.ogg" + logger.info(f"{text} >> {filename}") + + try: + response = requests.post( + f"{CAMB_API_BASE}/tts-stream", + headers={ + "x-api-key": api_key, + "Content-Type": "application/json", + }, + json={ + "text": text, + "voice_id": voice_id, + "language": camb_lang, + "speech_model": "mars-flash", + "output_configuration": {"format": "wav"}, + }, + ) + response.raise_for_status() + + audio_bytes = response.content + + # Parse WAV properly — find the 'data' chunk offset + # instead of hardcoding a header size + import struct + data_offset = 12 # skip RIFF header + while data_offset < len(audio_bytes) - 8: + chunk_id = audio_bytes[data_offset:data_offset + 4] + chunk_size = struct.unpack( + '