Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,14 @@ conda env config vars set OPENAI_API_KEY="your-api-key-here"
conda deactivate
```

- To use [MiniMax](https://www.minimaxi.com/) for translation (MiniMax-M2.5, MiniMax-M2.7) or TTS (speech-2.8-hd with 12 voices), set up your MiniMax API key:

```
conda activate sonitr
conda env config vars set MINIMAX_API_KEY="your-minimax-api-key-here"
conda deactivate
```

## Command line arguments

The app_rvc.py script supports command-line arguments to customize its behavior. Here's a brief guide on how to use them:
Expand Down Expand Up @@ -363,6 +371,7 @@ This project leverages a number of open-source projects. We would like to acknow
- [Coqui TTS](https://github.com/coqui-ai/TTS)
- [pypdf](https://github.com/py-pdf/pypdf)
- [OpenVoice](https://github.com/myshell-ai/OpenVoice)
- [MiniMax](https://www.minimaxi.com/) - LLM translation and TTS provider

## License
Although the code is licensed under Apache 2, the models or weights may have commercial restrictions, as seen with pyannote diarization.
18 changes: 18 additions & 0 deletions app_rvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
BARK_VOICES_LIST,
VITS_VOICES_LIST,
OPENAI_TTS_MODELS,
MINIMAX_TTS_MODELS,
)
from soni_translate.utils import (
remove_files,
Expand Down Expand Up @@ -120,6 +121,7 @@ def __init__(self, piper_enabled, xtts_enabled):
self.list_bark = list(BARK_VOICES_LIST.keys())
self.list_vits = list(VITS_VOICES_LIST.keys())
self.list_openai_tts = OPENAI_TTS_MODELS
self.list_minimax_tts = MINIMAX_TTS_MODELS
self.piper_enabled = piper_enabled
self.list_vits_onnx = (
piper_tts_voices_list() if self.piper_enabled else []
Expand All @@ -135,6 +137,7 @@ def tts_list(self):
+ self.list_bark
+ self.list_vits
+ self.list_openai_tts
+ self.list_minimax_tts
+ self.list_vits_onnx
)
return list_tts
Expand Down Expand Up @@ -268,6 +271,16 @@ def check_openai_api_key():
)


def check_minimax_api_key():
    """Verify that a MiniMax API key is available in the environment.

    Raises:
        ValueError: when MINIMAX_API_KEY is unset or empty.
    """
    key = os.environ.get("MINIMAX_API_KEY")
    if key:
        return
    raise ValueError(
        "To use MiniMax for translation or TTS, please set up your "
        "MiniMax API key as an environment variable: "
        "export MINIMAX_API_KEY='your-api-key-here'. Or change the "
        "translation process / TTS voice in settings."
    )


class SoniTranslate(SoniTrCache):
def __init__(self, cpu_mode=False):
super().__init__()
Expand Down Expand Up @@ -453,6 +466,9 @@ def multilingual_media_conversion(
):
check_openai_api_key()

if "MiniMax" in translate_process or "MiniMax-TTS" in tts_voice00:
check_minimax_api_key()

if media_file is None:
media_file = (
directory_input
Expand Down Expand Up @@ -1276,6 +1292,8 @@ def multilingual_docs_conversion(
):
if "gpt" in translate_process:
check_openai_api_key()
if "MiniMax" in translate_process or "MiniMax-TTS" in tts_voice00:
check_minimax_api_key()

SOURCE_LANGUAGE = LANGUAGES[origin_language]
if translate_process != "disable_translation":
Expand Down
15 changes: 15 additions & 0 deletions soni_translate/language_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,21 @@ def fix_code_language(translate_to, syntax="google"):
">shimmer HD OpenAI-TTS"
]

# MiniMax voice_ids exposed in the TTS voice dropdown; each is rendered as
# ">{voice_id} MiniMax-TTS" so the speaker-routing regex can match the suffix
# and the leading ">" can be stripped to recover the raw voice_id.
_MINIMAX_VOICE_IDS = (
    "Wise_Woman",
    "Deep_Voice_Man",
    "Friendly_Person",
    "Inspirational_girl",
    "sweet_girl",
    "cute_boy",
    "lovely_girl",
    "English_Graceful_Lady",
    "English_Insightful_Speaker",
    "English_radiant_girl",
    "English_Persuasive_Man",
    "English_Lucky_Robot",
)
MINIMAX_TTS_MODELS = [f">{vid} MiniMax-TTS" for vid in _MINIMAX_VOICE_IDS]

LANGUAGE_CODE_IN_THREE_LETTERS = {
"Automatic detection": "aut",
"ar": "ara",
Expand Down
95 changes: 93 additions & 2 deletions soni_translate/text_to_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import edge_tts, asyncio, json, glob # noqa
from tqdm import tqdm
import librosa, os, re, torch, gc, subprocess # noqa
import requests
from .language_configuration import (
fix_code_language,
BARK_VOICES_LIST,
Expand Down Expand Up @@ -939,6 +940,86 @@ def segments_openai_tts(
error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)


# =====================================
# MINIMAX TTS
# =====================================


def segments_minimax_tts(filtered_minimax_tts_segments, TRANSLATE_AUDIO_TO):
    """Synthesize audio for segments using the MiniMax t2a_v2 REST API.

    For each segment, requests MP3 speech for the translated text with the
    voice_id encoded in the segment's ``tts_name``, converts the MP3 to an
    OGG/Vorbis file at ``audio/{start}.ogg``, and verifies the written file.
    Per-segment failures are delegated to ``error_handling_in_tts``.

    Args:
        filtered_minimax_tts_segments: dict with a "segments" list; each
            segment carries "text", "start", "speaker" and "tts_name".
        TRANSLATE_AUDIO_TO: target language code, forwarded to the shared
            per-segment error handler.

    Raises:
        TTS_OperationError: if MINIMAX_API_KEY is not set.
    """
    api_key = os.environ.get("MINIMAX_API_KEY")
    if not api_key:
        raise TTS_OperationError(
            "MINIMAX_API_KEY environment variable is not set. "
            "Please set it to use MiniMax TTS."
        )

    for segment in tqdm(filtered_minimax_tts_segments["segments"]):
        speaker = segment["speaker"]  # noqa
        text = segment["text"].strip()
        start = segment["start"]
        tts_name = segment["tts_name"]

        # Extract voice_id from tts_name (e.g. ">Wise_Woman MiniMax-TTS")
        voice_id = tts_name.split()[0][1:]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> (unknown)")

        try:
            response = requests.post(
                "https://api.minimax.io/v1/t2a_v2",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": "speech-2.8-hd",
                    "text": text,
                    "voice_setting": {
                        "voice_id": voice_id,
                        "speed": 1.0,
                    },
                    "audio_setting": {
                        "format": "mp3",
                    },
                },
                timeout=60,
            )
            response.raise_for_status()
            result = response.json()

            if "data" not in result or "audio" not in result["data"]:
                raise TTS_OperationError(
                    f"MiniMax TTS returned unexpected response: {result}"
                )

            # The API returns the MP3 payload hex-encoded in data.audio.
            audio_hex = result["data"]["audio"]
            audio_bytes = bytes.fromhex(audio_hex)

            # Write mp3 to temp file, then read and convert to ogg/vorbis.
            temp_file = filename[:-3] + "mp3"
            with open(temp_file, "wb") as f:
                f.write(audio_bytes)

            data, sample_rate = sf.read(temp_file)
            # Remove the intermediate mp3 so it does not accumulate in audio/.
            os.remove(temp_file)
            data = pad_array(data, sample_rate)

            write_chunked(
                file=filename,
                samplerate=sample_rate,
                data=data,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)

        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)


# =====================================
# Select task TTS
# =====================================
Expand Down Expand Up @@ -1022,6 +1103,7 @@ def audio_segmentation_to_voice(
pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
pattern_vits_onnx = re.compile(r".* VITS-onnx$")
pattern_openai_tts = re.compile(r".* OpenAI-TTS$")
pattern_minimax_tts = re.compile(r".* MiniMax-TTS$")

all_segments = result_diarize["segments"]

Expand All @@ -1035,6 +1117,9 @@ def audio_segmentation_to_voice(
speakers_openai_tts = find_spkr(
pattern_openai_tts, speaker_to_voice, all_segments
)
speakers_minimax_tts = find_spkr(
pattern_minimax_tts, speaker_to_voice, all_segments
)

# Filter method in segments
filtered_edge = filter_by_speaker(speakers_edge, all_segments)
Expand All @@ -1043,6 +1128,7 @@ def audio_segmentation_to_voice(
filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)
filtered_minimax_tts = filter_by_speaker(speakers_minimax_tts, all_segments)

# Infer
if filtered_edge["segments"]:
Expand Down Expand Up @@ -1072,6 +1158,9 @@ def audio_segmentation_to_voice(
if filtered_openai_tts["segments"]:
logger.info(f"OpenAI TTS: {speakers_openai_tts}")
segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav
if filtered_minimax_tts["segments"]:
logger.info(f"MiniMax TTS: {speakers_minimax_tts}")
segments_minimax_tts(filtered_minimax_tts, TRANSLATE_AUDIO_TO)

[result.pop("tts_name", None) for result in result_diarize["segments"]]
return [
Expand All @@ -1080,7 +1169,8 @@ def audio_segmentation_to_voice(
speakers_vits,
speakers_coqui,
speakers_vits_onnx,
speakers_openai_tts
speakers_openai_tts,
speakers_minimax_tts,
]


Expand All @@ -1099,7 +1189,8 @@ def accelerate_segments(
speakers_vits,
speakers_coqui,
speakers_vits_onnx,
speakers_openai_tts
speakers_openai_tts,
speakers_minimax_tts,
) = valid_speakers

create_directories(f"{folder_output}/audio/")
Expand Down
51 changes: 47 additions & 4 deletions soni_translate/translate_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from deep_translator import GoogleTranslator
from itertools import chain
import copy
import os
from .language_configuration import fix_code_language, INVERTED_LANGUAGES
from .logging_setup import logger
import re
Expand All @@ -15,12 +16,18 @@
"gpt-3.5-turbo-0125",
"gpt-4-turbo-preview_batch",
"gpt-4-turbo-preview",
"MiniMax-M2.5_batch",
"MiniMax-M2.5",
"MiniMax-M2.7_batch",
"MiniMax-M2.7",
"disable_translation",
]
# Translation backends selectable for document translation; the UI string
# is matched directly in translate_text, and "disable_translation" skips
# the translation step entirely.
DOCS_TRANSLATION_PROCESS_OPTIONS = [
    "google_translator",
    "gpt-3.5-turbo-0125",
    "gpt-4-turbo-preview",
    "MiniMax-M2.5",
    "MiniMax-M2.7",
    "disable_translation",
]

Expand Down Expand Up @@ -213,6 +220,11 @@ def call_gpt_translate(
]
)
result = response.choices[0].message.content
# Strip thinking tags (e.g. from MiniMax models) before parsing
result = re.sub(r"<think>.*?</think>\s*", "", result, flags=re.DOTALL)
# Strip markdown code fences if present
result = re.sub(r"^```(?:json)?\s*\n?", "", result.strip(), flags=re.MULTILINE)
result = re.sub(r"\n?```\s*$", "", result.strip(), flags=re.MULTILINE)
logger.debug(f"Result: {str(result)}")

try:
Expand Down Expand Up @@ -267,12 +279,28 @@ def call_gpt_translate(
return translation


def gpt_sequential(segments, model, target, source=None):
def _create_minimax_client():
    """Build an OpenAI-SDK client pointed at the MiniMax-compatible endpoint.

    Reads the key from the MINIMAX_API_KEY environment variable.

    Raises:
        ValueError: when MINIMAX_API_KEY is unset or empty.
    """
    from openai import OpenAI

    key = os.environ.get("MINIMAX_API_KEY")
    if key:
        return OpenAI(api_key=key, base_url="https://api.minimax.io/v1")
    raise ValueError(
        "To use MiniMax for translation, please set up your MiniMax API "
        "key as an environment variable: "
        "export MINIMAX_API_KEY='your-api-key-here'. Or change the "
        "translation process in Advanced settings."
    )


def gpt_sequential(segments, model, target, source=None, client=None):
from openai import OpenAI

translated_segments = copy.deepcopy(segments)

client = OpenAI()
if client is None:
client = OpenAI()
progress_bar = tqdm(total=len(segments), desc="Translating")

lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
Expand Down Expand Up @@ -318,15 +346,16 @@ def gpt_sequential(segments, model, target, source=None):
return translated_segments


def gpt_batch(segments, model, target, token_batch_limit=900, source=None):
def gpt_batch(segments, model, target, token_batch_limit=900, source=None, client=None):
from openai import OpenAI
import tiktoken

token_batch_limit = max(100, (token_batch_limit - 40) // 2)
progress_bar = tqdm(total=len(segments), desc="Translating")
segments_copy = copy.deepcopy(segments)
encoding = tiktoken.get_encoding("cl100k_base")
client = OpenAI()
if client is None:
client = OpenAI()

lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
lang_sc = ""
Expand Down Expand Up @@ -451,6 +480,20 @@ def translate_text(
token_batch_limit,
source
)
case model if model in ["MiniMax-M2.5", "MiniMax-M2.7"]:
return gpt_sequential(
segments, model, target, source,
client=_create_minimax_client()
)
case model if model in ["MiniMax-M2.5_batch", "MiniMax-M2.7_batch"]:
return gpt_batch(
segments,
translation_process.replace("_batch", ""),
target,
token_batch_limit,
source,
client=_create_minimax_client()
)
case "disable_translation":
return segments
case _:
Expand Down
Empty file added tests/__init__.py
Empty file.
Loading