Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,14 @@ conda env config vars set OPENAI_API_KEY="your-api-key-here"
conda deactivate
```

- To use [MiniMax](https://www.minimaxi.com/) for translation (MiniMax-M2.5, MiniMax-M2.7) or TTS (speech-2.8-hd with 12 voices), set up your MiniMax API key:

```
conda activate sonitr
conda env config vars set MINIMAX_API_KEY="your-minimax-api-key-here"
conda deactivate
```

## Command line arguments

The app_rvc.py script supports command-line arguments to customize its behavior. Here's a brief guide on how to use them:
Expand Down Expand Up @@ -363,6 +371,7 @@ This project leverages a number of open-source projects. We would like to acknow
- [Coqui TTS](https://github.com/coqui-ai/TTS)
- [pypdf](https://github.com/py-pdf/pypdf)
- [OpenVoice](https://github.com/myshell-ai/OpenVoice)
- [MiniMax](https://www.minimaxi.com/) - LLM translation and TTS provider

## License
Although the code is licensed under Apache 2, the models or weights may have commercial restrictions, as seen with pyannote diarization.
18 changes: 18 additions & 0 deletions app_rvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
BARK_VOICES_LIST,
VITS_VOICES_LIST,
OPENAI_TTS_MODELS,
MINIMAX_TTS_MODELS,
)
from soni_translate.utils import (
remove_files,
Expand Down Expand Up @@ -120,6 +121,7 @@ def __init__(self, piper_enabled, xtts_enabled):
self.list_bark = list(BARK_VOICES_LIST.keys())
self.list_vits = list(VITS_VOICES_LIST.keys())
self.list_openai_tts = OPENAI_TTS_MODELS
self.list_minimax_tts = MINIMAX_TTS_MODELS
self.piper_enabled = piper_enabled
self.list_vits_onnx = (
piper_tts_voices_list() if self.piper_enabled else []
Expand All @@ -135,6 +137,7 @@ def tts_list(self):
+ self.list_bark
+ self.list_vits
+ self.list_openai_tts
+ self.list_minimax_tts
+ self.list_vits_onnx
)
return list_tts
Expand Down Expand Up @@ -268,6 +271,16 @@ def check_openai_api_key():
)


def check_minimax_api_key():
    """Verify that a MiniMax API key is available in the environment.

    Raises:
        ValueError: when MINIMAX_API_KEY is unset or empty.
    """
    key = os.environ.get("MINIMAX_API_KEY")
    if key:
        return
    raise ValueError(
        "To use MiniMax for translation or TTS, please set up your "
        "MiniMax API key as an environment variable: "
        "export MINIMAX_API_KEY='your-api-key-here'. Or change the "
        "translation process / TTS voice in settings."
    )


class SoniTranslate(SoniTrCache):
def __init__(self, cpu_mode=False):
super().__init__()
Expand Down Expand Up @@ -453,6 +466,9 @@ def multilingual_media_conversion(
):
check_openai_api_key()

if "MiniMax" in translate_process or "MiniMax-TTS" in tts_voice00:
check_minimax_api_key()

if media_file is None:
media_file = (
directory_input
Expand Down Expand Up @@ -1276,6 +1292,8 @@ def multilingual_docs_conversion(
):
if "gpt" in translate_process:
check_openai_api_key()
if "MiniMax" in translate_process or "MiniMax-TTS" in tts_voice00:
check_minimax_api_key()

SOURCE_LANGUAGE = LANGUAGES[origin_language]
if translate_process != "disable_translation":
Expand Down
15 changes: 15 additions & 0 deletions soni_translate/language_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,21 @@ def fix_code_language(translate_to, syntax="google"):
">shimmer HD OpenAI-TTS"
]

# MiniMax voice_ids exposed in the TTS voice dropdown; each is rendered as
# ">{voice_id} MiniMax-TTS" so the speaker-routing regex can match the suffix
# and the leading ">" can be stripped to recover the raw voice_id.
_MINIMAX_VOICE_IDS = (
    "Wise_Woman",
    "Deep_Voice_Man",
    "Friendly_Person",
    "Inspirational_girl",
    "sweet_girl",
    "cute_boy",
    "lovely_girl",
    "English_Graceful_Lady",
    "English_Insightful_Speaker",
    "English_radiant_girl",
    "English_Persuasive_Man",
    "English_Lucky_Robot",
)
MINIMAX_TTS_MODELS = [f">{vid} MiniMax-TTS" for vid in _MINIMAX_VOICE_IDS]

LANGUAGE_CODE_IN_THREE_LETTERS = {
"Automatic detection": "aut",
"ar": "ara",
Expand Down
95 changes: 93 additions & 2 deletions soni_translate/text_to_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import edge_tts, asyncio, json, glob # noqa
from tqdm import tqdm
import librosa, os, re, torch, gc, subprocess # noqa
import requests
from .language_configuration import (
fix_code_language,
BARK_VOICES_LIST,
Expand Down Expand Up @@ -939,6 +940,86 @@ def segments_openai_tts(
error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)


# =====================================
# MINIMAX TTS
# =====================================


def segments_minimax_tts(filtered_minimax_tts_segments, TRANSLATE_AUDIO_TO):
    """Synthesize audio for segments using the MiniMax t2a_v2 REST API.

    For each segment, requests MP3 speech for the translated text with the
    voice_id encoded in the segment's ``tts_name``, converts the MP3 to an
    OGG/Vorbis file at ``audio/{start}.ogg``, and verifies the written file.
    Per-segment failures are delegated to ``error_handling_in_tts``.

    Args:
        filtered_minimax_tts_segments: dict with a "segments" list; each
            segment carries "text", "start", "speaker" and "tts_name".
        TRANSLATE_AUDIO_TO: target language code, forwarded to the shared
            per-segment error handler.

    Raises:
        TTS_OperationError: if MINIMAX_API_KEY is not set.
    """
    api_key = os.environ.get("MINIMAX_API_KEY")
    if not api_key:
        raise TTS_OperationError(
            "MINIMAX_API_KEY environment variable is not set. "
            "Please set it to use MiniMax TTS."
        )

    for segment in tqdm(filtered_minimax_tts_segments["segments"]):
        speaker = segment["speaker"]  # noqa
        text = segment["text"].strip()
        start = segment["start"]
        tts_name = segment["tts_name"]

        # Extract voice_id from tts_name (e.g. ">Wise_Woman MiniMax-TTS")
        voice_id = tts_name.split()[0][1:]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> (unknown)")

        try:
            response = requests.post(
                "https://api.minimax.io/v1/t2a_v2",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": "speech-2.8-hd",
                    "text": text,
                    "voice_setting": {
                        "voice_id": voice_id,
                        "speed": 1.0,
                    },
                    "audio_setting": {
                        "format": "mp3",
                    },
                },
                timeout=60,
            )
            response.raise_for_status()
            result = response.json()

            if "data" not in result or "audio" not in result["data"]:
                raise TTS_OperationError(
                    f"MiniMax TTS returned unexpected response: {result}"
                )

            # The API returns the MP3 payload hex-encoded in data.audio.
            audio_hex = result["data"]["audio"]
            audio_bytes = bytes.fromhex(audio_hex)

            # Write mp3 to temp file, then read and convert to ogg/vorbis.
            temp_file = filename[:-3] + "mp3"
            with open(temp_file, "wb") as f:
                f.write(audio_bytes)

            data, sample_rate = sf.read(temp_file)
            # Remove the intermediate mp3 so it does not accumulate in audio/.
            os.remove(temp_file)
            data = pad_array(data, sample_rate)

            write_chunked(
                file=filename,
                samplerate=sample_rate,
                data=data,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)

        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)


# =====================================
# Select task TTS
# =====================================
Expand Down Expand Up @@ -1022,6 +1103,7 @@ def audio_segmentation_to_voice(
pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
pattern_vits_onnx = re.compile(r".* VITS-onnx$")
pattern_openai_tts = re.compile(r".* OpenAI-TTS$")
pattern_minimax_tts = re.compile(r".* MiniMax-TTS$")

all_segments = result_diarize["segments"]

Expand All @@ -1035,6 +1117,9 @@ def audio_segmentation_to_voice(
speakers_openai_tts = find_spkr(
pattern_openai_tts, speaker_to_voice, all_segments
)
speakers_minimax_tts = find_spkr(
pattern_minimax_tts, speaker_to_voice, all_segments
)

# Filter method in segments
filtered_edge = filter_by_speaker(speakers_edge, all_segments)
Expand All @@ -1043,6 +1128,7 @@ def audio_segmentation_to_voice(
filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)
filtered_minimax_tts = filter_by_speaker(speakers_minimax_tts, all_segments)

# Infer
if filtered_edge["segments"]:
Expand Down Expand Up @@ -1072,6 +1158,9 @@ def audio_segmentation_to_voice(
if filtered_openai_tts["segments"]:
logger.info(f"OpenAI TTS: {speakers_openai_tts}")
segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav
if filtered_minimax_tts["segments"]:
logger.info(f"MiniMax TTS: {speakers_minimax_tts}")
segments_minimax_tts(filtered_minimax_tts, TRANSLATE_AUDIO_TO)

[result.pop("tts_name", None) for result in result_diarize["segments"]]
return [
Expand All @@ -1080,7 +1169,8 @@ def audio_segmentation_to_voice(
speakers_vits,
speakers_coqui,
speakers_vits_onnx,
speakers_openai_tts
speakers_openai_tts,
speakers_minimax_tts,
]


Expand All @@ -1099,7 +1189,8 @@ def accelerate_segments(
speakers_vits,
speakers_coqui,
speakers_vits_onnx,
speakers_openai_tts
speakers_openai_tts,
speakers_minimax_tts,
) = valid_speakers

create_directories(f"{folder_output}/audio/")
Expand Down
51 changes: 47 additions & 4 deletions soni_translate/translate_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from deep_translator import GoogleTranslator
from itertools import chain
import copy
import os
from .language_configuration import fix_code_language, INVERTED_LANGUAGES
from .logging_setup import logger
import re
Expand All @@ -15,12 +16,18 @@
"gpt-3.5-turbo-0125",
"gpt-4-turbo-preview_batch",
"gpt-4-turbo-preview",
"MiniMax-M2.5_batch",
"MiniMax-M2.5",
"MiniMax-M2.7_batch",
"MiniMax-M2.7",
"disable_translation",
]
# Translation backends selectable for document translation; the UI string
# is matched directly in translate_text, and "disable_translation" skips
# the translation step entirely.
DOCS_TRANSLATION_PROCESS_OPTIONS = [
    "google_translator",
    "gpt-3.5-turbo-0125",
    "gpt-4-turbo-preview",
    "MiniMax-M2.5",
    "MiniMax-M2.7",
    "disable_translation",
]

Expand Down Expand Up @@ -213,6 +220,11 @@ def call_gpt_translate(
]
)
result = response.choices[0].message.content
# Strip thinking tags (e.g. from MiniMax models) before parsing
result = re.sub(r"<think>.*?</think>\s*", "", result, flags=re.DOTALL)
# Strip markdown code fences if present
result = re.sub(r"^```(?:json)?\s*\n?", "", result.strip(), flags=re.MULTILINE)
result = re.sub(r"\n?```\s*$", "", result.strip(), flags=re.MULTILINE)
logger.debug(f"Result: {str(result)}")

try:
Expand Down Expand Up @@ -267,12 +279,28 @@ def call_gpt_translate(
return translation


def gpt_sequential(segments, model, target, source=None):
def _create_minimax_client():
    """Build an OpenAI-SDK client pointed at the MiniMax-compatible endpoint.

    Reads the key from the MINIMAX_API_KEY environment variable.

    Raises:
        ValueError: when MINIMAX_API_KEY is unset or empty.
    """
    from openai import OpenAI

    key = os.environ.get("MINIMAX_API_KEY")
    if key:
        return OpenAI(api_key=key, base_url="https://api.minimax.io/v1")
    raise ValueError(
        "To use MiniMax for translation, please set up your MiniMax API "
        "key as an environment variable: "
        "export MINIMAX_API_KEY='your-api-key-here'. Or change the "
        "translation process in Advanced settings."
    )


def gpt_sequential(segments, model, target, source=None, client=None):
from openai import OpenAI

translated_segments = copy.deepcopy(segments)

client = OpenAI()
if client is None:
client = OpenAI()
progress_bar = tqdm(total=len(segments), desc="Translating")

lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
Expand Down Expand Up @@ -318,15 +346,16 @@ def gpt_sequential(segments, model, target, source=None):
return translated_segments


def gpt_batch(segments, model, target, token_batch_limit=900, source=None):
def gpt_batch(segments, model, target, token_batch_limit=900, source=None, client=None):
from openai import OpenAI
import tiktoken

token_batch_limit = max(100, (token_batch_limit - 40) // 2)
progress_bar = tqdm(total=len(segments), desc="Translating")
segments_copy = copy.deepcopy(segments)
encoding = tiktoken.get_encoding("cl100k_base")
client = OpenAI()
if client is None:
client = OpenAI()

lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
lang_sc = ""
Expand Down Expand Up @@ -451,6 +480,20 @@ def translate_text(
token_batch_limit,
source
)
case model if model in ["MiniMax-M2.5", "MiniMax-M2.7"]:
return gpt_sequential(
segments, model, target, source,
client=_create_minimax_client()
)
case model if model in ["MiniMax-M2.5_batch", "MiniMax-M2.7_batch"]:
return gpt_batch(
segments,
translation_process.replace("_batch", ""),
target,
token_batch_limit,
source,
client=_create_minimax_client()
)
case "disable_translation":
return segments
case _:
Expand Down
Empty file added tests/__init__.py
Empty file.
Loading