Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion app_rvc.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from dotenv import load_dotenv
load_dotenv()
import gradio as gr
from soni_translate.logging_setup import (
logger,
Expand Down Expand Up @@ -40,6 +42,7 @@
BARK_VOICES_LIST,
VITS_VOICES_LIST,
OPENAI_TTS_MODELS,
camb_tts_voices_list,
)
from soni_translate.utils import (
remove_files,
Expand Down Expand Up @@ -120,6 +123,7 @@ def __init__(self, piper_enabled, xtts_enabled):
self.list_bark = list(BARK_VOICES_LIST.keys())
self.list_vits = list(VITS_VOICES_LIST.keys())
self.list_openai_tts = OPENAI_TTS_MODELS
self.list_camb_tts = camb_tts_voices_list()
self.piper_enabled = piper_enabled
self.list_vits_onnx = (
piper_tts_voices_list() if self.piper_enabled else []
Expand All @@ -135,6 +139,7 @@ def tts_list(self):
+ self.list_bark
+ self.list_vits
+ self.list_openai_tts
+ self.list_camb_tts
+ self.list_vits_onnx
)
return list_tts
Expand Down Expand Up @@ -1133,9 +1138,11 @@ def multilingual_media_conversion(
burn_subtitles_to_video
], {}):
# Merge new audio + video
# Use AAC encoding (-c:a aac) instead of -c:a copy because the
# mix audio is MP3, and MP3-in-MP4 won't play on macOS/QuickTime.
remove_files(video_output_file)
run_command(
f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_file}"
f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a aac -b:a 192k -map 0:v -map 1:a -shortest {video_output_file}"
)

output = media_out(
Expand Down
43 changes: 43 additions & 0 deletions soni_translate/language_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,49 @@ def fix_code_language(translate_to, syntax="google"):
">shimmer HD OpenAI-TTS"
]

# CAMB AI TTS voices - fetched dynamically from the API
# Format: ">voice_id voice_name [gender] CAMB-TTS"
# The language is passed separately at TTS time based on the target language,
# so voices are not language-specific in the dropdown.

# Display labels for the "gender" field of a CAMB voice entry, keyed by the
# integer code. Codes that are not listed here produce no gender label.
CAMB_GENDER_MAP = {1: "Male", 2: "Female"}


def camb_tts_voices_list():
    """Fetch the CAMB AI voice catalogue formatted for the TTS voice list.

    Each entry has the form ``">voice_id voice_name [gender] CAMB-TTS"``.
    The gender label is omitted when the reported gender code is not a key
    of ``CAMB_GENDER_MAP``.

    Returns:
        A sorted list of formatted voice strings. The list is empty when
        ``CAMB_API_KEY`` is unset or empty, or when the request or the
        response parsing fails (the failure is logged, never raised).
    """
    import os
    import logging

    logger = logging.getLogger(__name__)
    api_key = os.getenv("CAMB_API_KEY", "")
    if not api_key:
        logger.info("CAMB_API_KEY not set, CAMB TTS voices disabled")
        return []

    try:
        # Imported only once a key is configured, so the disabled path does
        # not depend on the HTTP client being installed.
        import requests

        response = requests.get(
            "https://client.camb.ai/apis/list-voices",
            headers={"x-api-key": api_key},
            timeout=30,
        )
        response.raise_for_status()
        voices = response.json()
        if not isinstance(voices, list):
            raise ValueError(
                f"unexpected payload type {type(voices).__name__}"
            )

        formatted = []
        for voice in voices:
            if not isinstance(voice, dict):
                continue
            voice_id = voice.get("id")
            if voice_id is None:
                # Without an id the entry would be rendered as ">None ...",
                # which cannot be turned back into a voice id at TTS time.
                continue
            # `or` also covers an explicit null name in the payload.
            name = voice.get("voice_name") or "unknown"
            gender = CAMB_GENDER_MAP.get(voice.get("gender"), "")
            label = f"{name} {gender}".strip()
            formatted.append(f">{voice_id} {label} CAMB-TTS")

        logger.info("Loaded %d CAMB TTS voices", len(formatted))
        return sorted(formatted)
    except Exception as error:
        # Deliberately broad: listing voices is best-effort and any failure
        # simply disables the CAMB entries instead of propagating.
        logger.warning("Failed to fetch CAMB TTS voices: %s", error)
        return []

LANGUAGE_CODE_IN_THREE_LETTERS = {
"Automatic detection": "aut",
"ar": "ara",
Expand Down
127 changes: 125 additions & 2 deletions soni_translate/text_to_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ def edge_tts_voices_list():
voice_entry["Gender"] = line.split(": ")[1]
voices.append(voice_entry)

# Fallback: parse table format (newer edge-tts versions)
if not voices:
for line in lines:
parts = line.split()
if len(parts) >= 2 and "-" in parts[0] and parts[1] in ("Male", "Female"):
voices.append({"Name": parts[0], "Gender": parts[1]})

formatted_voices = [
f"{entry['Name']}-{entry['Gender']}" for entry in voices
]
Expand Down Expand Up @@ -939,6 +946,112 @@ def segments_openai_tts(
error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)


# =====================================
# CAMB AI TTS
# =====================================


def _camb_wav_to_pcm(audio_bytes, default_rate):
    """Extract 16-bit PCM samples and the sample rate from a WAV payload.

    Walks the RIFF chunk list instead of assuming a fixed header size.
    When the payload is not a RIFF/WAVE container, or has no ``data``
    chunk, the whole buffer is treated as raw PCM at ``default_rate``.

    NOTE(review): samples are interpreted as 16-bit mono, as the original
    implementation did; confirm against the CAMB API output format.

    Returns:
        ``(samples, sample_rate)`` where ``samples`` is an ``int16`` array.
    """
    import struct

    sample_rate = default_rate
    payload = audio_bytes  # fallback: treat the response as raw PCM
    total = len(audio_bytes)

    if audio_bytes[:4] == b"RIFF" and audio_bytes[8:12] == b"WAVE":
        offset = 12  # skip the RIFF header
        while offset + 8 <= total:
            chunk_id = audio_bytes[offset:offset + 4]
            (chunk_size,) = struct.unpack_from("<I", audio_bytes, offset + 4)
            body_start = offset + 8

            if (
                chunk_id == b"fmt "
                and chunk_size >= 8
                and body_start + 8 <= total
            ):
                # fmt layout: format tag (2), channels (2), sample rate (4)
                (rate,) = struct.unpack_from(
                    "<I", audio_bytes, body_start + 4
                )
                if 8000 <= rate <= 192000:
                    sample_rate = rate
            elif chunk_id == b"data":
                body_end = body_start + chunk_size
                # Streamed WAVs often carry a placeholder size (0 or
                # 0xFFFFFFFF); in that case take everything that follows.
                if chunk_size == 0 or body_end > total:
                    body_end = total
                payload = audio_bytes[body_start:body_end]
                break

            # RIFF chunks are word-aligned: odd sizes have one pad byte.
            offset = body_start + chunk_size + (chunk_size & 1)

    # int16 decoding needs an even number of bytes; drop a dangling byte.
    payload = payload[: len(payload) - (len(payload) % 2)]
    return np.frombuffer(payload, dtype=np.int16), sample_rate


def segments_camb_tts(
    filtered_camb_tts_segments, TRANSLATE_AUDIO_TO
):
    """Synthesize every segment with the CAMB AI streaming TTS endpoint.

    For each segment the text is sent to ``/tts-stream`` with the voice id
    parsed from ``tts_name`` (format ``">voice_id name [gender] CAMB-TTS"``)
    and the result is written to ``audio/{start}.ogg``.

    Args:
        filtered_camb_tts_segments: dict with a ``"segments"`` list; each
            segment provides ``"text"``, ``"start"`` and ``"tts_name"``.
        TRANSLATE_AUDIO_TO: target language code; codes without a CAMB
            locale mapping fall back to ``"en-us"``.

    Raises:
        TTS_OperationError: when ``CAMB_API_KEY`` is not set.

    Any failure while processing a single segment is handed to
    ``error_handling_in_tts`` instead of aborting the remaining segments.
    """
    import requests
    import os as _os

    api_key = _os.getenv("CAMB_API_KEY", "")
    if not api_key:
        raise TTS_OperationError(
            "CAMB_API_KEY environment variable is required for CAMB AI TTS"
        )

    # Used only when the WAV header does not provide a usable sample rate.
    default_sampling_rate = 24000
    CAMB_API_BASE = "https://client.camb.ai/apis"
    # Without a timeout a stalled connection would block the whole run.
    request_timeout = 120

    # Map SoniTranslate language code to CAMB locale (loop-invariant).
    CAMB_LANG_MAP = {
        "ar": "ar-sa", "zh": "zh-cn", "zh-CN": "zh-cn",
        "zh-TW": "zh-cn", "nl": "nl-nl", "en": "en-us",
        "fr": "fr-fr", "de": "de-de", "hi": "hi-in",
        "id": "id-id", "it": "it-it", "ja": "ja-jp",
        "ko": "ko-kr", "pl": "pl-pl", "pt": "pt-br",
        "ru": "ru-ru", "es": "es-es", "tr": "tr-tr",
        "ta": "ta-in", "te": "te-in", "bn": "bn-in",
    }
    camb_lang = CAMB_LANG_MAP.get(TRANSLATE_AUDIO_TO, "en-us")
    request_headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json",
    }

    for segment in tqdm(filtered_camb_tts_segments["segments"]):
        text = segment["text"].strip()
        start = segment["start"]
        tts_name = segment["tts_name"]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")

        try:
            # Parsed inside the try so a malformed voice entry is handled
            # per segment like any other synthesis failure.
            voice_id = int(tts_name.split()[0][1:])  # Remove '>' prefix

            response = requests.post(
                f"{CAMB_API_BASE}/tts-stream",
                headers=request_headers,
                json={
                    "text": text,
                    "voice_id": voice_id,
                    "language": camb_lang,
                    "speech_model": "mars-flash",
                    "output_configuration": {"format": "wav"},
                },
                timeout=request_timeout,
            )
            response.raise_for_status()

            speech_output, sampling_rate = _camb_wav_to_pcm(
                response.content, default_sampling_rate
            )

            # Save file
            data_tts = pad_array(
                speech_output,
                sampling_rate,
            )

            write_chunked(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)

        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)


# =====================================
# Select task TTS
# =====================================
Expand Down Expand Up @@ -1022,6 +1135,7 @@ def audio_segmentation_to_voice(
pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
pattern_vits_onnx = re.compile(r".* VITS-onnx$")
pattern_openai_tts = re.compile(r".* OpenAI-TTS$")
pattern_camb_tts = re.compile(r".* CAMB-TTS$")

all_segments = result_diarize["segments"]

Expand All @@ -1035,6 +1149,9 @@ def audio_segmentation_to_voice(
speakers_openai_tts = find_spkr(
pattern_openai_tts, speaker_to_voice, all_segments
)
speakers_camb_tts = find_spkr(
pattern_camb_tts, speaker_to_voice, all_segments
)

# Filter method in segments
filtered_edge = filter_by_speaker(speakers_edge, all_segments)
Expand All @@ -1043,6 +1160,7 @@ def audio_segmentation_to_voice(
filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)
filtered_camb_tts = filter_by_speaker(speakers_camb_tts, all_segments)

# Infer
if filtered_edge["segments"]:
Expand Down Expand Up @@ -1072,6 +1190,9 @@ def audio_segmentation_to_voice(
if filtered_openai_tts["segments"]:
logger.info(f"OpenAI TTS: {speakers_openai_tts}")
segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav
if filtered_camb_tts["segments"]:
logger.info(f"CAMB AI TTS: {speakers_camb_tts}")
segments_camb_tts(filtered_camb_tts, TRANSLATE_AUDIO_TO) # wav

[result.pop("tts_name", None) for result in result_diarize["segments"]]
return [
Expand All @@ -1080,7 +1201,8 @@ def audio_segmentation_to_voice(
speakers_vits,
speakers_coqui,
speakers_vits_onnx,
speakers_openai_tts
speakers_openai_tts,
speakers_camb_tts,
]


Expand All @@ -1099,7 +1221,8 @@ def accelerate_segments(
speakers_vits,
speakers_coqui,
speakers_vits_onnx,
speakers_openai_tts
speakers_openai_tts,
speakers_camb_tts,
) = valid_speakers

create_directories(f"{folder_output}/audio/")
Expand Down