Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions phoonnx/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@
noise_scale: float = DEFAULT_NOISE_SCALE
noise_w_scale: float = DEFAULT_NOISE_W_SCALE
add_diacritics: bool = None # arabic and hebrew
# diacritizer model name (for languages that need one — e.g. Arabic uses text2tashkeel models like "rawi-ensemble")
diacritizer_model: str = "rawi-ensemble"

# tokenization settings
tokenizer: Optional[TTSTokenizer] = None
Expand Down Expand Up @@ -256,6 +258,7 @@
phoneme_type = phoneme_type or config.get("phoneme_type")
alphabet = alphabet or config.get("alphabet")
diacritics = False
ar_diacritizer_model = "rawi-ensemble"

if VoiceConfig.is_phoonnx(config):
engine = engine or config.get("engine") or Engine.PHOONNX
Expand All @@ -264,6 +267,7 @@
phoneme_type = phoneme_type or config.get("phoneme_type", PhonemeType.ESPEAK)
alphabet = alphabet or Alphabet(config.get("alphabet", "ipa"))
diacritics = config.get("inference", {}).get("add_diacritics", True)
ar_diacritizer_model = config.get("inference", {}).get("diacritizer_model", "rawi-ensemble")

# Preserve the model's own special tokens when present (a native
# config may use any pad/blank/bos/eos); fall back to phoonnx defaults.
Expand Down Expand Up @@ -394,6 +398,7 @@
length_scale=inference.get("length_scale", DEFAULT_LENGTH_SCALE),
noise_w_scale=inference.get("noise_w", DEFAULT_NOISE_W_SCALE),
add_diacritics=diacritics,
diacritizer_model=ar_diacritizer_model,
lang_code=lang_code,
alphabet=Alphabet(alphabet) if isinstance(alphabet, str) else alphabet,
engine=Engine(engine) if isinstance(engine, str) else engine,
Expand Down Expand Up @@ -443,6 +448,7 @@
"lang_id_map": dict(self.lang_id_map or {}),
"phonemizer_model": self.phonemizer_model,
"add_diacritics": self.add_diacritics,
"diacritizer_model": self.diacritizer_model,
"inference": {
"noise_scale": self.noise_scale,
"length_scale": self.length_scale,
Expand Down Expand Up @@ -490,13 +496,16 @@
"""for arabic and hebrew models"""
add_diacritics: bool = True

# diacritizer model name (for languages that need one — e.g. Arabic uses text2tashkeel models like "rawi-ensemble")
diacritizer_model: str = "rawi-ensemble"

# Engine-specific per-call params (d_factor, p_factor, e_factor, …)
extra_params: Dict[str, Any] = field(default_factory=dict)


def get_phonemizer(phoneme_type: PhonemeType,
alphabet: Alphabet = Alphabet.IPA,
model: Optional[str] = None) -> 'Phonemizer':

Check failure on line 508 in phoonnx/config.py

View workflow job for this annotation

GitHub Actions / lint / lint

ruff (F821)

phoonnx/config.py:508:53: F821 Undefined name `Phonemizer`
"""
Create a phonemizer instance for the specified phonemization strategy.

Expand Down
37 changes: 26 additions & 11 deletions phoonnx/phonemizers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from phoonnx.config import Alphabet
from phoonnx.util import normalize, match_lang
from phoonnx.thirdparty.phonikud import PhonikudDiacritizer
from phoonnx.thirdparty.tashkeel import TashkeelDiacritizer

# list of (substring, terminator, end_of_sentence) tuples.
TextChunks = List[Tuple[str, str, bool]]
Expand All @@ -21,25 +20,40 @@

class BasePhonemizer(metaclass=abc.ABCMeta):
def __init__(self, alphabet: Alphabet = Alphabet.UNICODE,
taskeen_threshold: Optional[float] = 0.8):
diacritizer_model: str = "rawi-ensemble"):
super().__init__()
self.alphabet = alphabet

self.taskeen_threshold = taskeen_threshold # arabic only
self._tashkeel: Optional[TashkeelDiacritizer] = None
# diacritizer model name, for languages that need one. Arabic uses
# text2tashkeel; the default "rawi-ensemble" restores hamza and the dagger
# alef in addition to the standard marks.
self.diacritizer_model = diacritizer_model
self._phonikud: Optional[PhonikudDiacritizer] = None # hebrew only
self._tashkeel: dict = {} # model name -> text2tashkeel Diacritizer

@property
def phonikud(self) -> PhonikudDiacritizer:
if self._phonikud is None:
self._phonikud = PhonikudDiacritizer()
return self._phonikud

@property
def tashkeel(self) -> TashkeelDiacritizer:
if self._tashkeel is None:
self._tashkeel = TashkeelDiacritizer()
return self._tashkeel
def tashkeel(self, model: Optional[str] = None):
"""Lazily build (and cache) the text2tashkeel Diacritizer used for Arabic.

text2tashkeel is a dependency of the ``[ar]`` extra; it restores hamza and the
dagger alef in addition to the standard marks. Install with
``pip install phoonnx[ar]`` (or ``pip install text2tashkeel``)."""
model = model or self.diacritizer_model
if model not in self._tashkeel:
try:
from text2tashkeel import Diacritizer
except ImportError as e:
raise ImportError(
"Arabic diacritization requires the text2tashkeel package: "
"pip install phoonnx[ar] (or pip install text2tashkeel)"
) from e
self._tashkeel[model] = Diacritizer(model)
return self._tashkeel[model]

@abc.abstractmethod
def phonemize_string(self, text: str, lang: str) -> str:
Expand All @@ -48,11 +62,12 @@ def phonemize_string(self, text: str, lang: str) -> str:
def phonemize_to_list(self, text: str, lang: str) -> List[str]:
return list(self.phonemize_string(text, lang))

def add_diacritics(self, text: str, lang: str) -> str:
def add_diacritics(self, text: str, lang: str,
model: Optional[str] = None) -> str:
if lang.startswith("he"):
return self.phonikud.diacritize(text)
elif lang.startswith("ar"):
return self.tashkeel.diacritize(text, self.taskeen_threshold)
return self.tashkeel(model).diacritize(text)
return text

def phonemize(self, text: str, lang: str) -> PhonemizedChunks:
Expand Down
22 changes: 0 additions & 22 deletions phoonnx/thirdparty/tashkeel/LICENSE

This file was deleted.

1 change: 0 additions & 1 deletion phoonnx/thirdparty/tashkeel/SOURCE

This file was deleted.

212 changes: 0 additions & 212 deletions phoonnx/thirdparty/tashkeel/__init__.py

This file was deleted.

18 changes: 0 additions & 18 deletions phoonnx/thirdparty/tashkeel/hint_id_map.json

This file was deleted.

Loading
Loading