Softcatala · neilruaro-camb · Mar 25, 2026
diff --git a/open_dubbing/command_line.py b/open_dubbing/command_line.py
@@ -69,7 +69,7 @@ def read_parameters():
             "--tts",
             type=str,
             default="mms",
-            choices=["mms", "coqui", "openai", "edge", "cli", "api"],
+            choices=["mms", "coqui", "openai", "edge", "cli", "api", "camb"],
             help=(
                 "Text to Speech engine to use. Choices are:\n"
                 "'mms': Meta Multilingual Speech engine, supports +1100 languages.\n"
@@ -78,6 +78,7 @@ def read_parameters():
                 "'edge': Microsoft Edge TTS.\n"
                 "'cli': User defined TTS invoked from command line.\n"
                 "'api': Implements a user defined TTS API contract to enable non supported TTS.\n"
+                "'camb': CAMB AI TTS with voice cloning support, 16 languages.\n"
             ),
         )
         parser.add_argument(
@@ -89,12 +90,13 @@ def read_parameters():
             "--stt",
             type=str,
             default="auto",
-            choices=["auto", "faster-whisper", "transformers"],
+            choices=["auto", "faster-whisper", "transformers", "camb"],
             help=(
                 "Speech to text. Choices are:\n"
                 "'auto': Autoselect best implementation.\n"
                 "'faster-whisper': Faster-whisper's OpenAI whisper implementation.\n"
                 "'transformers': Transformers OpenAI whisper implementation.\n"
+                "'camb': CAMB AI cloud transcription.\n"
             ),
         )
         parser.add_argument(
@@ -107,11 +109,12 @@ def read_parameters():
             "--translator",
             type=str,
             default="nllb",
-            choices=["nllb", "apertium"],
+            choices=["nllb", "apertium", "camb"],
             help=(
                 "Translation engine to use. Choices are:\n"
                 "'nllb': Meta's no Language Left Behind (NLLB).\n"
                 "'apertium': Apertium compatible API server.\n"
+                "'camb': CAMB AI cloud translation, 16 languages.\n"
             ),
         )
         parser.add_argument(
@@ -190,6 +193,12 @@ def read_parameters():
             default="",
             help=("TTS api server URL when using the 'API' tts"),
         )
+        parser.add_argument(
+            "--camb_api_key",
+            default=None,
+            help="CAMB AI API key used for CAMB TTS, translation, and STT. Can also be set via CAMB_API_KEY environment variable.",
+        )
+
         parser.add_argument(
             "--update",
             action="store_true",

diff --git a/open_dubbing/exit_code.py b/open_dubbing/exit_code.py
@@ -30,3 +30,4 @@ class ExitCode(IntEnum):
     UPDATE_MISSING_FILES = 111
     NO_OPENAI_TTS = 112
     NO_OPENAI_KEY = 113
+    NO_CAMB_KEY = 114
diff --git a/open_dubbing/main.py b/open_dubbing/main.py
@@ -151,6 +151,7 @@ def _get_selected_tts(
     tts_api_server: str,
     device: str,
     openai_api_key: str,
+    camb_api_key: str = "",
 ):
     if selected_tts == "mms":
         tts = TextToSpeechMMS(device)
@@ -188,14 +189,19 @@ def _get_selected_tts(
 
         key = _get_openai_key(key=openai_api_key)
         tts = TextToSpeechOpenAI(device=device, api_key=key)
+    elif selected_tts == "camb":
+        from open_dubbing.text_to_speech_camb import TextToSpeechCamb
+
+        key = _get_camb_key(key=camb_api_key)
+        tts = TextToSpeechCamb(device=device, api_key=key)
     else:
         raise ValueError(f"Invalid tts value {selected_tts}")
 
     return tts
 
 
 def _get_selected_translator(
-    translator: str, nllb_model: str, apertium_server: str, device: str
+    translator: str, nllb_model: str, apertium_server: str, device: str, camb_api_key: str = ""
 ):
     if translator == "nllb":
         translation = TranslationNLLB(device)
@@ -208,12 +214,31 @@ def _get_selected_translator(
 
         translation = TranslationApertium(device)
         translation.set_server(server)
+    elif translator == "camb":
+        from open_dubbing.translation_camb import TranslationCamb
+
+        key = _get_camb_key(key=camb_api_key)
+        translation = TranslationCamb(device, api_key=key)
+        translation.load_model()
     else:
         raise ValueError(f"Invalid translator value {translator}")
 
     return translation
 
 
+def _get_camb_key(*, key: str):
+    """Return `key`, else $CAMB_API_KEY; with neither, call log_error_and_exit."""
+    env_name = "CAMB_API_KEY"
+    resolved = key or os.getenv(env_name)
+    if resolved:
+        return resolved
+
+    log_error_and_exit(
+        f"CAMB AI selected but no key has been passed as argument or defined in the environment variable {env_name}",
+        ExitCode.NO_CAMB_KEY,
+    )
+
+
 def _get_openai_key(*, key: str):
     if key:
         return key
@@ -228,6 +253,9 @@ def _get_openai_key(*, key: str):
 
 
 def main():
+    from dotenv import load_dotenv
+
+    load_dotenv()
 
     args = CommandLine.read_parameters()
     _init_logging(args.log_level)
@@ -240,20 +268,28 @@ def main():
         msg = "You need to have ffmpeg (which includes ffprobe) installed."
         log_error_and_exit(msg, ExitCode.NO_FFMPEG)
 
+    camb_api_key = getattr(args, "camb_api_key", None) or ""
+
     tts = _get_selected_tts(
         args.tts,
         args.tts_cli_cfg_file,
         args.tts_api_server,
         args.device,
         args.openai_api_key,
+        camb_api_key=camb_api_key,
     )
 
     if sys.platform == "darwin":
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
     stt_type = args.stt
     stt_text = args.stt
-    if stt_type == "faster-whisper" or (
+    if stt_type == "camb":
+        from open_dubbing.speech_to_text_camb import SpeechToTextCamb
+
+        key = _get_camb_key(key=camb_api_key)
+        stt = SpeechToTextCamb(device=args.device, api_key=key)
+    elif stt_type == "faster-whisper" or (
         stt_type == "auto" and sys.platform != "darwin"
     ):
         stt = SpeechToTextFasterWhisper(
@@ -282,7 +318,8 @@ def main():
         logger().info(f"Detected language '{source_language}'")
 
     translation = _get_selected_translator(
-        args.translator, args.nllb_model, args.apertium_server, args.device
+        args.translator, args.nllb_model, args.apertium_server, args.device,
+        camb_api_key=camb_api_key,
     )
 
     check_languages(

diff --git a/open_dubbing/speech_to_text_camb.py b/open_dubbing/speech_to_text_camb.py
@@ -0,0 +1,194 @@
+# Copyright 2025 CAMB AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import array
+import os
+import time
+
+import numpy as np
+import requests
+
+from open_dubbing import logger
+from open_dubbing.speech_to_text import SpeechToText
+
+
+# Base URL shared by every CAMB AI endpoint called from this module.
+CAMB_API_BASE = "https://client.camb.ai/apis"
+
+# Mapping from ISO 639-3 to CAMB AI numeric language IDs (for transcription)
+ISO_639_3_TO_CAMB_LANG_ID = {
+    "eng": 1,
+    "spa": 54,
+    "fra": 76,
+    "deu": 31,
+    "jpn": 88,
+    "hin": 81,
+    "por": 111,
+    "zho": 139,
+    "kor": 47,
+    "ita": 86,
+    "nld": 66,
+    "rus": 120,
+    "ara": 4,
+    "tam": 131,
+    "tel": 133,
+    "ben": 13,
+}
+
+# Reverse mapping (CAMB AI numeric ID -> ISO 639-3). Nothing in this module reads
+# it: _get_audio_language() below does not detect the language.
+CAMB_LANG_ID_TO_ISO_639_3 = {v: k for k, v in ISO_639_3_TO_CAMB_LANG_ID.items()}
+
+
+class SpeechToTextCamb(SpeechToText):
+    """Speech-to-text backend that delegates transcription to the CAMB AI cloud API.
+
+    The class attributes below tune the HTTP behaviour and can be overridden on a
+    subclass or an instance.
+    """
+
+    # Seconds allowed per HTTP call; requests waits indefinitely without a timeout.
+    REQUEST_TIMEOUT = 60
+    # Seconds slept between two task status checks.
+    POLL_INTERVAL = 2
+    # Number of status checks made before giving up with TimeoutError.
+    MAX_POLL_ATTEMPTS = 60
+
+    def __init__(self, *, device="cpu", model_name="camb", cpu_threads=0, api_key=""):
+        """Store the API key, falling back to the CAMB_API_KEY environment variable."""
+        super().__init__(device=device, model_name=model_name, cpu_threads=cpu_threads)
+        self.api_key = api_key or os.getenv("CAMB_API_KEY", "")
+
+    def _headers(self):
+        """Return the authentication header sent with every request."""
+        return {"x-api-key": self.api_key}
+
+    def _api_get(self, path: str) -> dict:
+        """GET ``CAMB_API_BASE/path`` and return the decoded JSON body.
+
+        Raises requests.HTTPError when the API answers with a 4xx/5xx status.
+        """
+        response = requests.get(
+            f"{CAMB_API_BASE}/{path}",
+            headers=self._headers(),
+            timeout=self.REQUEST_TIMEOUT,
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def load_model(self):
+        """Do nothing: the cloud API has no local model to load."""
+
+    def get_languages(self):
+        """Return the supported languages as a sorted list of ISO 639-3 codes."""
+        return sorted(ISO_639_3_TO_CAMB_LANG_ID)
+
+    def _transcribe(
+        self,
+        *,
+        vocals_filepath: str,
+        source_language_iso_639_1: str,
+    ) -> str:
+        """Upload an audio file to CAMB AI and return its transcription.
+
+        Args:
+            vocals_filepath: path of the audio file to upload.
+            source_language_iso_639_1: ISO 639-1 code of the spoken language.
+
+        Raises:
+            requests.HTTPError: the API answered with a 4xx/5xx status.
+            RuntimeError: the task failed, or succeeded without a run_id.
+            TimeoutError: the task did not finish within MAX_POLL_ATTEMPTS checks.
+        """
+        # Convert ISO 639-1 to ISO 639-3 then to CAMB language ID
+        from iso639 import Lang
+
+        iso_639_3 = Lang(source_language_iso_639_1).pt3
+        lang_id = ISO_639_3_TO_CAMB_LANG_ID.get(iso_639_3)
+        if lang_id is None:
+            # Keep the English fallback, but make the substitution visible instead
+            # of silently transcribing in the wrong language.
+            lang_id = ISO_639_3_TO_CAMB_LANG_ID["eng"]
+            logger().warning(
+                f"speech_to_text_camb: language '{iso_639_3}' is not supported by CAMB AI, transcribing as English"
+            )
+
+        # Upload file for transcription
+        with open(vocals_filepath, "rb") as f:
+            response = requests.post(
+                f"{CAMB_API_BASE}/transcribe",
+                headers=self._headers(),
+                files={"media_file": (os.path.basename(vocals_filepath), f)},
+                data={"language": lang_id},
+                timeout=self.REQUEST_TIMEOUT,
+            )
+
+        if not response.ok:
+            logger().error(
+                f"speech_to_text_camb: Transcription API error {response.status_code}: {response.text[:200]}"
+            )
+        response.raise_for_status()
+        task_id = response.json()["task_id"]
+
+        # The upload response carries a task id; poll it until it reaches a final state.
+        for _ in range(self.MAX_POLL_ATTEMPTS):
+            status_data = self._api_get(f"transcribe/{task_id}")
+            status = status_data.get("status")
+
+            if status == "SUCCESS":
+                run_id = status_data.get("run_id")
+                if run_id is None:
+                    # Without a run_id the result URL would end in "/None".
+                    raise RuntimeError(
+                        f"CAMB AI transcription task {task_id} succeeded without a run_id: {status_data}"
+                    )
+                result = self._api_get(f"transcription-result/{run_id}")
+
+                # The API returns {"transcript": [{"text": "...", ...}, ...]}
+                transcript_segments = result.get("transcript") or []
+                if transcript_segments:
+                    transcription = " ".join(
+                        seg.get("text") or "" for seg in transcript_segments
+                    ).strip()
+                else:
+                    # Fall back to flat fields; "or" also covers JSON nulls so a
+                    # string is always returned.
+                    transcription = (
+                        result.get("transcription") or result.get("text") or ""
+                    )
+
+                logger().debug(
+                    f"speech_to_text_camb._transcribe: '{transcription[:80]}'"
+                )
+                return transcription
+            if status in ("ERROR", "FAILED"):
+                raise RuntimeError(
+                    f"CAMB AI transcription failed for task {task_id}: {status_data}"
+                )
+
+            time.sleep(self.POLL_INTERVAL)
+
+        raise TimeoutError(f"CAMB AI transcription timed out for task {task_id}")
+
+    def _get_audio_language(self, audio: array.array) -> str:
+        """Return "eng" unconditionally; the audio is not inspected.
+
+        CAMB AI doesn't have a dedicated language detection endpoint, so we default
+        to English. For proper detection, use Whisper-based STT.
+        """
+        # Logged as a warning because the assumed language may not match the audio.
+        logger().warning(
+            "speech_to_text_camb._get_audio_language: defaulting to 'eng' (CAMB AI does not support language detection)"
+        )
+        return "eng"