Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions open_dubbing/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def read_parameters():
"--tts",
type=str,
default="mms",
choices=["mms", "coqui", "openai", "edge", "cli", "api"],
choices=["mms", "coqui", "openai", "edge", "cli", "api", "camb"],
help=(
"Text to Speech engine to use. Choices are:\n"
"'mms': Meta Multilingual Speech engine, supports +1100 languages.\n"
Expand All @@ -78,6 +78,7 @@ def read_parameters():
"'edge': Microsoft Edge TTS.\n"
"'cli': User defined TTS invoked from command line.\n"
"'api': Implements a user defined TTS API contract to enable non supported TTS.\n"
"'camb': CAMB AI TTS with voice cloning support, 16 languages.\n"
),
)
parser.add_argument(
Expand All @@ -89,12 +90,13 @@ def read_parameters():
"--stt",
type=str,
default="auto",
choices=["auto", "faster-whisper", "transformers"],
choices=["auto", "faster-whisper", "transformers", "camb"],
help=(
"Speech to text. Choices are:\n"
"'auto': Autoselect best implementation.\n"
"'faster-whisper': Faster-whisper's OpenAI whisper implementation.\n"
"'transformers': Transformers OpenAI whisper implementation.\n"
"'camb': CAMB AI cloud transcription.\n"
),
)
parser.add_argument(
Expand All @@ -107,11 +109,12 @@ def read_parameters():
"--translator",
type=str,
default="nllb",
choices=["nllb", "apertium"],
choices=["nllb", "apertium", "camb"],
help=(
"Translation engine to use. Choices are:\n"
"'nllb': Meta's no Language Left Behind (NLLB).\n"
"'apertium': Apertium compatible API server.\n"
"'camb': CAMB AI cloud translation, 16 languages.\n"
),
)
parser.add_argument(
Expand Down Expand Up @@ -190,6 +193,12 @@ def read_parameters():
default="",
help=("TTS api server URL when using the 'API' tts"),
)
parser.add_argument(
"--camb_api_key",
default=None,
help="CAMB AI API key used for CAMB TTS, translation, and STT. Can also be set via CAMB_API_KEY environment variable.",
)

parser.add_argument(
"--update",
action="store_true",
Expand Down
1 change: 1 addition & 0 deletions open_dubbing/exit_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ class ExitCode(IntEnum):
UPDATE_MISSING_FILES = 111
NO_OPENAI_TTS = 112
NO_OPENAI_KEY = 113
NO_CAMB_KEY = 114
43 changes: 40 additions & 3 deletions open_dubbing/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def _get_selected_tts(
tts_api_server: str,
device: str,
openai_api_key: str,
camb_api_key: str = "",
):
if selected_tts == "mms":
tts = TextToSpeechMMS(device)
Expand Down Expand Up @@ -188,14 +189,19 @@ def _get_selected_tts(

key = _get_openai_key(key=openai_api_key)
tts = TextToSpeechOpenAI(device=device, api_key=key)
elif selected_tts == "camb":
from open_dubbing.text_to_speech_camb import TextToSpeechCamb

key = _get_camb_key(key=camb_api_key)
tts = TextToSpeechCamb(device=device, api_key=key)
else:
raise ValueError(f"Invalid tts value {selected_tts}")

return tts


def _get_selected_translator(
translator: str, nllb_model: str, apertium_server: str, device: str
translator: str, nllb_model: str, apertium_server: str, device: str, camb_api_key: str = ""
):
if translator == "nllb":
translation = TranslationNLLB(device)
Expand All @@ -208,12 +214,31 @@ def _get_selected_translator(

translation = TranslationApertium(device)
translation.set_server(server)
elif translator == "camb":
from open_dubbing.translation_camb import TranslationCamb

key = _get_camb_key(key=camb_api_key)
translation = TranslationCamb(device, api_key=key)
translation.load_model()
else:
raise ValueError(f"Invalid translator value {translator}")

return translation


def _get_camb_key(*, key: str):
if key:
return key

VAR = "CAMB_API_KEY"
key = os.getenv(VAR)
if key:
return key

msg = f"CAMB AI selected but no key has been passed as argument or defined in the environment variable {VAR}"
log_error_and_exit(msg, ExitCode.NO_CAMB_KEY)


def _get_openai_key(*, key: str):
if key:
return key
Expand All @@ -228,6 +253,9 @@ def _get_openai_key(*, key: str):


def main():
from dotenv import load_dotenv

load_dotenv()

args = CommandLine.read_parameters()
_init_logging(args.log_level)
Expand All @@ -240,20 +268,28 @@ def main():
msg = "You need to have ffmpeg (which includes ffprobe) installed."
log_error_and_exit(msg, ExitCode.NO_FFMPEG)

camb_api_key = getattr(args, "camb_api_key", None) or ""

tts = _get_selected_tts(
args.tts,
args.tts_cli_cfg_file,
args.tts_api_server,
args.device,
args.openai_api_key,
camb_api_key=camb_api_key,
)

if sys.platform == "darwin":
os.environ["TOKENIZERS_PARALLELISM"] = "false"

stt_type = args.stt
stt_text = args.stt
if stt_type == "faster-whisper" or (
if stt_type == "camb":
from open_dubbing.speech_to_text_camb import SpeechToTextCamb

key = _get_camb_key(key=camb_api_key)
stt = SpeechToTextCamb(device=args.device, api_key=key)
elif stt_type == "faster-whisper" or (
stt_type == "auto" and sys.platform != "darwin"
):
stt = SpeechToTextFasterWhisper(
Expand Down Expand Up @@ -282,7 +318,8 @@ def main():
logger().info(f"Detected language '{source_language}'")

translation = _get_selected_translator(
args.translator, args.nllb_model, args.apertium_server, args.device
args.translator, args.nllb_model, args.apertium_server, args.device,
camb_api_key=camb_api_key,
)

check_languages(
Expand Down
146 changes: 146 additions & 0 deletions open_dubbing/speech_to_text_camb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Copyright 2025 CAMB AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import array
import os
import time

import numpy as np
import requests

from open_dubbing import logger
from open_dubbing.speech_to_text import SpeechToText


# Root URL that every CAMB AI endpoint path in this module is appended to.
CAMB_API_BASE = "https://client.camb.ai/apis"

# ISO 639-3 language code -> numeric language ID that CAMB AI expects in
# transcription requests.
ISO_639_3_TO_CAMB_LANG_ID = dict(
    eng=1,
    spa=54,
    fra=76,
    deu=31,
    jpn=88,
    hin=81,
    por=111,
    zho=139,
    kor=47,
    ita=86,
    nld=66,
    rus=120,
    ara=4,
    tam=131,
    tel=133,
    ben=13,
)

# Inverse lookup (CAMB numeric ID -> ISO 639-3 code), intended for language
# detection according to the original author's note.
CAMB_LANG_ID_TO_ISO_639_3 = {
    lang_id: iso_code for iso_code, lang_id in ISO_639_3_TO_CAMB_LANG_ID.items()
}


class SpeechToTextCamb(SpeechToText):
    """Speech-to-text backend that sends audio to the CAMB AI cloud API.

    Transcription is asynchronous on the server side: the audio file is
    uploaded, the returned task is polled until it reaches a terminal status,
    and the transcript is then fetched with the run ID reported by the task.

    The class attributes below are tuning knobs; override them on a subclass
    or an instance to change the network behaviour.
    """

    # Seconds passed as ``timeout`` to every HTTP call so a stalled connection
    # raises instead of blocking the whole dubbing run indefinitely.
    REQUEST_TIMEOUT = 120
    # Maximum number of status polls before giving up on a task.
    MAX_POLL_ATTEMPTS = 60
    # Seconds to wait between two consecutive status polls.
    POLL_INTERVAL = 2

    def __init__(self, *, device="cpu", model_name="camb", cpu_threads=0, api_key=""):
        """Store the API key, falling back to the CAMB_API_KEY env variable.

        ``device``, ``model_name`` and ``cpu_threads`` are forwarded unchanged
        to the ``SpeechToText`` base class.
        """
        super().__init__(device=device, model_name=model_name, cpu_threads=cpu_threads)
        self.api_key = api_key or os.getenv("CAMB_API_KEY", "")

    def _headers(self):
        """Return the authentication header sent with every request."""
        return {"x-api-key": self.api_key}

    def load_model(self):
        """Do nothing: the model runs remotely, there is nothing to load."""
        # No local model to load for cloud API
        pass

    def get_languages(self):
        """Return the supported ISO 639-3 codes in alphabetical order."""
        return sorted(ISO_639_3_TO_CAMB_LANG_ID)

    def _transcribe(
        self,
        *,
        vocals_filepath: str,
        source_language_iso_639_1: str,
    ) -> str:
        """Transcribe an audio file through the CAMB AI API.

        Args:
            vocals_filepath: Path of the audio file to upload.
            source_language_iso_639_1: ISO 639-1 code of the spoken language.

        Returns:
            The transcribed text (possibly empty).

        Raises:
            requests.HTTPError: An API call returned an error status.
            requests.Timeout: An API call exceeded ``REQUEST_TIMEOUT``.
            RuntimeError: The task reported ERROR/FAILED, or reported SUCCESS
                without a run ID.
            TimeoutError: The task did not finish within
                ``MAX_POLL_ATTEMPTS`` polls.
        """
        # Convert ISO 639-1 to ISO 639-3 then to CAMB language ID
        from iso639 import Lang

        iso_639_3 = Lang(source_language_iso_639_1).pt3
        lang_id = ISO_639_3_TO_CAMB_LANG_ID.get(iso_639_3)
        if lang_id is None:
            # The English fallback is kept for backward compatibility, but it
            # is now reported: a silent fallback yields a wrong-language
            # transcript with no hint about the cause.
            lang_id = ISO_639_3_TO_CAMB_LANG_ID["eng"]
            logger().warning(
                f"speech_to_text_camb: language '{iso_639_3}' is not supported by CAMB AI, falling back to 'eng'"
            )

        task_id = self._submit_transcription(vocals_filepath, lang_id)
        run_id = self._wait_for_run_id(task_id)
        transcription = self._fetch_transcription(run_id)

        logger().debug(f"speech_to_text_camb._transcribe: '{transcription[:80]}'")
        return transcription

    def _submit_transcription(self, vocals_filepath, lang_id):
        """Upload the audio file and return the ID of the created task."""
        # The request must be issued while the file handle is still open.
        with open(vocals_filepath, "rb") as f:
            files = {"media_file": (os.path.basename(vocals_filepath), f)}
            data = {"language": lang_id}
            response = requests.post(
                f"{CAMB_API_BASE}/transcribe",
                headers=self._headers(),
                files=files,
                data=data,
                timeout=self.REQUEST_TIMEOUT,
            )

        if not response.ok:
            # Log a short excerpt of the body; the raised HTTPError alone
            # does not include the server's explanation.
            logger().error(
                f"speech_to_text_camb: Transcription API error {response.status_code}: {response.text[:200]}"
            )
        response.raise_for_status()
        return response.json()["task_id"]

    def _wait_for_run_id(self, task_id):
        """Poll the task until it succeeds and return its run ID."""
        for attempt in range(self.MAX_POLL_ATTEMPTS):
            # Sleep before every poll except the first, so no time is wasted
            # sleeping after the final attempt.
            if attempt:
                time.sleep(self.POLL_INTERVAL)

            status_response = requests.get(
                f"{CAMB_API_BASE}/transcribe/{task_id}",
                headers=self._headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            status_response.raise_for_status()
            status_data = status_response.json()
            status = status_data.get("status")

            if status == "SUCCESS":
                run_id = status_data.get("run_id")
                if run_id is None:
                    # Without this check the result URL would end in "/None".
                    raise RuntimeError(
                        f"CAMB AI transcription succeeded for task {task_id} but returned no run_id: {status_data}"
                    )
                return run_id

            if status in ("ERROR", "FAILED"):
                raise RuntimeError(
                    f"CAMB AI transcription failed for task {task_id}: {status_data}"
                )

        raise TimeoutError(f"CAMB AI transcription timed out for task {task_id}")

    def _fetch_transcription(self, run_id):
        """Download the result of a finished run and return it as one string."""
        result_response = requests.get(
            f"{CAMB_API_BASE}/transcription-result/{run_id}",
            headers=self._headers(),
            timeout=self.REQUEST_TIMEOUT,
        )
        result_response.raise_for_status()
        result = result_response.json()

        # Per the original author's note the API returns
        # {"transcript": [{"text": "...", ...}, ...]}; the flat keys below
        # are kept as a fallback for other payload shapes.
        transcript_segments = result.get("transcript", [])
        if transcript_segments:
            return " ".join(
                seg.get("text", "") for seg in transcript_segments
            ).strip()

        # "or" chaining guarantees a string even when a key is present but
        # null, which the declared "-> str" return type requires.
        return result.get("transcription") or result.get("text") or ""

    def _get_audio_language(self, audio: array.array) -> str:
        """Return "eng" unconditionally; ``audio`` is not inspected."""
        # CAMB AI doesn't have a dedicated language detection endpoint,
        # so we default to English. For proper detection, use Whisper-based STT.
        logger().debug(
            "speech_to_text_camb._get_audio_language: defaulting to 'eng' (CAMB AI does not support language detection)"
        )
        return "eng"
Loading