diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c74ad51 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2017, Josef Novak +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/src/bin/phonetisaurus-align.cc b/src/bin/phonetisaurus-align.cc index 7c44d0f..e26fda2 100644 --- a/src/bin/phonetisaurus-align.cc +++ b/src/bin/phonetisaurus-align.cc @@ -63,8 +63,7 @@ int load_input_file (M2MFstAligner* aligner, string input_file, lines++; } infile.close (); - } - else { + } else { cerr << "Failed to open input file: " << input_file << endl; return -1; } @@ -213,7 +212,7 @@ void compileNBestFarArchive (M2MFstAligner* aligner, set_syms = true; } - sprintf (keybuf, "%0*d", generate_keys, i+1); + snprintf (keybuf, sizeof (keybuf), "%0*d", generate_keys, i+1); key = keybuf; //Write the final result to the FARchive diff --git a/src/bin/phonetisaurus_apply b/src/bin/phonetisaurus_apply new file mode 100755 index 0000000..66c22f0 --- /dev/null +++ b/src/bin/phonetisaurus_apply @@ -0,0 +1,308 @@ +#!/usr/bin/env python +# -*- mode: python; coding: utf-8 -*- +from __future__ import print_function +import os, logging, subprocess, time, re +from datetime import datetime +from collections import defaultdict +import tempfile + +class G2PModelTester () : + """G2P Model training wrapper class. + + Phonetisaurus G2P modeling training wrapper class. + This wraps the alignment, joint n-gram training, and ARPA to + WFST conversion steps into one command. + """ + + def __init__ (self, model, **kwargs) : + self.model = model + self.lexicon_file = kwargs.get ("lexicon", None) + self.nbest = kwargs.get ("nbest", 1) + self.thresh = kwargs.get ("thresh", 99) + self.beam = kwargs.get ("beam", 10000) + self.greedy = kwargs.get ("greedy", False) + self.verbose = kwargs.get ("verbose", False) + self.logger = self.setupLogger () + + def setupLogger (self) : + """Setup the logger and logging level. + + Setup the logger and logging level. We only support + verbose and non-verbose mode. + + Args: + verbose (bool): Verbose mode, or not. + + Returns: + Logger: A configured logger instance. 
+ """ + + level = logging.DEBUG if self.verbose else logging.INFO + logging.basicConfig ( + level=level, + format="\033[94m%(levelname)s:%(name)s:"\ + "%(asctime)s\033[0m: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S" + ) + + return logging.getLogger ("phonetisaurus-apply") + + def _loadLexicon (self) : + """Load the lexicon from a file. + + Load the reference lexicon from a file, and store it + in a defaultdict (list). + """ + + _lexicon = defaultdict (list) + if not self.lexicon_file : + return _lexicon + + self.logger.debug ("Loading lexicon from file...") + with open (self.lexicon_file, "r") as ifp : + for line in ifp : + line = line.decode ("utf8").strip () + word, pron = re.split (ur"\t", line) + _lexicon [word].append (pron) + + return _lexicon + + def checkPhonetisaurusConfig (self) : + """Run some basic checks before training. + + Run some basic checks regarding the $PATH, environment, + and provided data before starting training. + + Raises: + EnvironmentError: raised if binaries are not found. + """ + + self.logger.debug ("Checking command configuration...") + for program in ["phonetisaurus-g2pfst"] : + if not self.which (program) : + raise EnvironmentError, "Phonetisaurus command, '{0}', "\ + "not found in path.".format (program) + + if self.lexicon_file and not os.path.exists (self.lexicon_file) : + self.logger.error ("Could not find provided lexicon file.") + sys.exit (1) + + for key,val in sorted (vars (self).iteritems ()) : + self.logger.debug (u"{0}: {1}".format (key, val)) + + self.lexicon = self._loadLexicon () + + return + + def which (self, program) : + """Basic 'which' implementation for python. + + Basic 'which' implementation for python from stackoverflow: + * https://stackoverflow.com/a/377028/6739158 + + Args: + program (str): The program name to search the $PATH for. + + Returns: + path/None: The path to the executable, or None. 
+ """ + + def is_exe (fpath) : + return os.path.isfile (fpath) and os.access (fpath, os.X_OK) + + fpath, fname = os.path.split (program) + if fpath: + if is_exe (program): + return program + else: + for path in os.environ["PATH"].split (os.pathsep) : + path = path.strip ('"') + exe_file = os.path.join (path, program) + if is_exe (exe_file): + return exe_file + + return None + + def makeG2PCommand (self, word_list) : + """Build the G2P command. + + Build the G2P command from the provided arguments. + + Returns: + list: The command in subprocess list format. + """ + + command = [ + u"phonetisaurus-g2pfst", + u"--model={0}".format (self.model), + u"--nbest={0}".format (self.nbest), + u"--beam={0}".format (self.beam), + u"--thresh={0}".format (self.thresh), + u"--wordlist={0}".format (word_list) + ] + + self.logger.debug (u" ".join (command)) + + return command + + def runG2PCommand (self, word_list_file) : + """Generate and run the actual G2P command. + + Generate and run the actual G2P command. Each synthesized + entry will be yielded back on-the-fly via the subprocess + stdout readline method. + + Args: + word_list_file (str): The input word list. + """ + g2p_command = self.makeG2PCommand (word_list_file) + + self.logger.debug ("Applying G2P model...") + + with open (os.devnull, "w") as devnull : + proc = subprocess.Popen ( + g2p_command, + stdout=subprocess.PIPE, + stderr=devnull if not self.verbose else None + ) + + for line in iter (proc.stdout.readline, "") : + parts = re.split (ur"\t", line.decode ("utf8").strip ()) + if not len (parts) == 3 : + self.logger.warning ( + u"No pronunciation for word: '{0}'".format (parts [0]) + ) + continue + + yield parts + + return + + def applyG2POnly (self, word_list_file) : + """Apply the G2P model to a word list. + + Apply the G2P model to a word list. No filtering or application + of a reference lexicon is used here. + + Args: + word_list_file (str): The input word list. 
+ """ + for word, score, pron in self.runG2PCommand (word_list_file) : + line = u"" + if self.verbose : + line = u"{0}\t{1:.2f}\t{2}".format ( + word, float (score), pron + ) + else : + line = u"{0}\t{1}".format (word, pron) + print (line.encode ("utf8")) + + return + + def applyG2PWithLexicon (self, word_list_file) : + """Apply the G2P model to a word list, combined with lexicon. + + Apply the G2P model to a word list, but combine this with + a reference lexicon. Words for which a reference entry exists + will not be sent to the G2P, unless the additional '--greedy' + flag is set to True. + + Args: + word_list_file (str): The input word list. + """ + target_lexicon = defaultdict (list) + tmpwordlist = tempfile.NamedTemporaryFile (delete=False) + + #First, find any words in the target list for which we already + # have a canonical pronunciation in the reference lexicon. + with open (word_list_file, "r") as ifp : + for word in ifp : + word = word.decode ("utf8").strip () + if word in self.lexicon : + target_lexicon [word] = [(0.0,pron) + for pron in self.lexicon [word]] + #In greedy mode we still send words to the G2P, even + # if we have canonical entries in the reference lexicon. + if self.greedy : + print (word.encode ("utf8"), file=tmpwordlist) + else : + print (word.encode ("utf8"), file=tmpwordlist) + tmpwordlist.close () + + #Second, iterate through the G2P output, and filter against + # any possible duplicates previously found in the reference lexicon. + for word, score, pron in self.runG2PCommand (tmpwordlist.name) : + prons = set ([p for s,p in target_lexicon [word]]) + if pron in prons : + continue + target_lexicon [word].append ((score, pron)) + + #Finally, sort everything that is left and print it. 
+ for word in sorted (target_lexicon.keys ()) : + for score, pron in target_lexicon [word] : + line = u"" + if self.verbose : + line = u"{0}\t{1:.2f}\t{2}".format ( + word, float (score), pron + ) + else : + line = u"{0}\t{1}".format (word, pron) + print (line.encode ("utf8")) + + os.unlink (tmpwordlist.name) + return + + def ApplyG2PModel (self, word_list_file) : + """Apply the G2P model to a word list. + + Apply the G2P model to a word list. + + Args: + word_list_file (str): The input word list. + """ + self.checkPhonetisaurusConfig () + + if not os.path.exists (word_list_file) \ + or not os.path.isfile (word_list_file) : + raise IOError, "Word list file not found." + + if len (self.lexicon) == 0 : + self.applyG2POnly (word_list_file) + else : + self.applyG2PWithLexicon (word_list_file) + + return + +if __name__ == "__main__" : + import sys, argparse + + example = "{0} --model train/model.fst --word test".format (sys.argv [0]) + + parser = argparse.ArgumentParser (description=example) + parser.add_argument ("--model", "-m", help="Phonetisaurus G2P fst model.", + required=True) + parser.add_argument ("--lexicon", "-l", help="Optional reference lexicon.", + required=False) + parser.add_argument ("--nbest", "-n", help="Nbest highest order.", + default=1, type=int) + parser.add_argument ("--beam", "-b", help="Search 'beam'.", + default=10000, type=int) + parser.add_argument ("--thresh", "-t", help="Pruning threshold for n-best.", + default=99.0, type=float) + parser.add_argument ("--greedy", "-g", help="Use the G2P even if a " + "reference lexicon has been provided.", default=False, + action="store_true") + parser.add_argument ("--word_list", "-wl", help="Input word or word list to apply " + "G2P model to.", type=str) + + parser.add_argument ("--verbose", "-v", help="Verbose mode.", + default=False, action="store_true") + args = parser.parse_args () + + tester = G2PModelTester ( + args.model, + **{key:val for key,val in args.__dict__.iteritems () + if not key in 
["model","word_list"]} + ) + + tester.ApplyG2PModel (args.word_list) diff --git a/src/bin/phonetisaurus_train b/src/bin/phonetisaurus_train new file mode 100755 index 0000000..fcbc408 --- /dev/null +++ b/src/bin/phonetisaurus_train @@ -0,0 +1,350 @@ +#!/usr/bin/env python +# -*- mode: python; coding: utf-8 -*- +from __future__ import print_function +import os, logging, subprocess, time, re +from datetime import datetime + + +class G2PModelTrainer () : + """G2P Model training wrapper class. + + Phonetisaurus G2P modeling training wrapper class. + This wraps the alignment, joint n-gram training, and ARPA to + WFST conversion steps into one command. + """ + + def __init__ (self, lexicon_file, **kwargs) : + self.lexicon_file = lexicon_file + self.model_prefix = kwargs.get ("model_prefix", "model") + self.dir_prefix = kwargs.get ("dir_prefix", "train") + self.ngram_order = kwargs.get ("ngram_order", 8) + self.seq1_max = kwargs.get ("seq1_max", 2) + self.seq2_max = kwargs.get ("seq2_max", 2) + self.seq1_del = kwargs.get ("seq1_del", False) + self.seq2_del = kwargs.get ("seq2_del", False) + self.verbose = kwargs.get ("verbose", False) + self.logger = self.setupLogger () + self.makeJointNgramCommand = self._setLMCommand ( + kwargs.get ("lm", "mitlm") + ) + + def setupLogger (self) : + """Setup the logger and logging level. + + Setup the logger and logging level. We only support + verbose and non-verbose mode. + + Args: + verbose (bool): Verbose mode, or not. + + Returns: + Logger: A configured logger instance. + """ + + level = logging.DEBUG if self.verbose else logging.INFO + logging.basicConfig ( + level=level, + format="\033[94m%(levelname)s:%(name)s:"\ + "%(asctime)s\033[0m: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S" + ) + + return logging.getLogger ("phonetisaurus-train") + + def validateLexicon (self) : + """Validate the input training lexicon. + + Validate the input training lexicon. 
At present + this simply checks if the default reserved characters, + ['}', '|', '_'], are used present in the lexicon. + """ + + validator = re.compile (ur"[\}\|_]") + + with open (self.lexicon_file, "r") as ifp : + for line in ifp : + if validator.search (line.decode ("utf8")) : + error = "Bad line contains reservered character:\n\t{0}" + error = error.format (line) + raise ValueError, error + + return + + def checkPhonetisaurusConfig (self) : + """Run some basic checks before training. + + Run some basic checks regarding the $PATH, environment, + and provided data before starting training. + + Raises: + EnvironmentError: raised if binaries are not found. + """ + + self.logger.info ("Checking command configuration...") + for program in ["phonetisaurus-g2pfst", + "phonetisaurus-align", + "phonetisaurus-arpa2wfst"] : + if not self.which (program) : + raise EnvironmentError, "Phonetisaurus command, '{0}', "\ + "not found in path.".format (program) + + if not os.path.isdir (self.dir_prefix) : + self.logger.debug ("Directory does not exist. Trying to create.") + os.makedirs (self.dir_prefix) + + self.logger.info ( + "Checking lexicon for reserved characters: '}', '|', '_'..." + ) + self.validateLexicon () + + path_prefix = os.path.join (self.dir_prefix, self.model_prefix) + + self.corpus_path = u"{0}.corpus".format (path_prefix) + self.arpa_path = u"{0}.o{1}.arpa".format (path_prefix, self.ngram_order) + self.model_path = u"{0}.fst".format (path_prefix) + + for key,val in sorted (vars (self).iteritems ()) : + self.logger.debug (u"{0}: {1}".format (key, val)) + + return + + def which (self, program) : + """Basic 'which' implementation for python. 
+ + Basic 'which' implementation for python from stackoverflow: + * https://stackoverflow.com/a/377028/6739158 + """ + + def is_exe (fpath) : + return os.path.isfile (fpath) and os.access (fpath, os.X_OK) + + fpath, fname = os.path.split (program) + if fpath: + if is_exe (program): + return program + else: + for path in os.environ["PATH"].split (os.pathsep) : + path = path.strip ('"') + exe_file = os.path.join (path, program) + if is_exe (exe_file): + return exe_file + + return None + + def _setLMCommand (self, lm) : + """Configure the LM training command. + + Configure the LM training command according to the LM toolkit + selected by the user. Currently only mitlm is supported. + + Args: + lm (str): The selected command type: 'mitlm'. + + Returns: + function: The command building function for the selected toolkit. + """ + if lm == "mitlm" : + if self.which ("estimate-ngram") == None : + raise EnvironmentError, "mitlm binary 'estimate-ngram' not "\ + "found in path." + return self._mitlm + else : + raise NotImplementedError, "Only mitlm is currently supported." + + + def _mitlm (self) : + """mitlm estimate-ngram joint ngram training command. + + Build the mitlm joint ngram training command using the + estimate-ngram utility and provided arguments. + + Returns: + list: The command in subprocess list format. + """ + + command = [ + "estimate-ngram", + "-o", str (self.ngram_order), + "-t", self.corpus_path, + "-wl", self.arpa_path + ] + + self.logger.debug (u" ".join (command)) + + return command + + def makeAlignerCommand (self) : + """Build the aligner command from the provided arguments. + + Build the aligner command from the provided arguments. + + Returns: + list: The command in subprocess list format. 
+ """ + + command = [ + "phonetisaurus-align", + "--input={0}".format (self.lexicon_file), + "--ofile={0}".format (self.corpus_path), + "--seq1_del={0}".format (str (self.seq1_del).lower ()), + "--seq2_del={0}".format (str (self.seq2_del).lower ()), + "--seq1_max={0}".format (str (self.seq1_max)), + "--seq2_max={0}".format (str (self.seq2_max)) + ] + + self.logger.debug (u" ".join (command)) + + return command + + def makeARPAToWFSTCommand (self) : + """Build the ARPA to Fst conversion command. + + Build the ARPA to Fst conversion command from the provided arguments. + + Returns: + list: The command in subprocess list format. + """ + + command = [ + "phonetisaurus-arpa2wfst", + "--lm={0}".format (self.arpa_path), + "--ofile={0}".format (self.model_path) + ] + + self.logger.debug (u" ".join (command)) + + return command + + def AlignLexicon (self) : + """Align the provided input pronunciation lexicon. + + Align the provided input pronunciation lexicon according to the + provided parameters. + + Returns: + bool: True on success, False on failure. + """ + + aligner_command = self.makeAlignerCommand () + + self.logger.info ("Aligning lexicon...") + try : + if self.verbose : + subprocess.check_call (aligner_command) + else : + with open (os.devnull, "w") as devnull : + subprocess.check_call ( + aligner_command, + stderr=devnull, + stdout=devnull + ) + except subprocess.CalledProcessError : + self.logger.error ("Alignment failed. Exiting.") + sys.exit (1) + + return + + def TrainNGramModel (self) : + """Train the joint ngram model. + + Train the joint ngram model using the selected toolkit. + + Returns: + bool: True on success, False on failure. 
+ """ + joint_ngram_command = self.makeJointNgramCommand () + + self.logger.info ("Training joint ngram model...") + try : + if self.verbose : + subprocess.check_call (joint_ngram_command) + else : + with open (os.devnull, "w") as devnull : + subprocess.check_call ( + joint_ngram_command, + stderr=devnull, + stdout=devnull + ) + except subprocess.CalledProcessError : + self.logger.error ("Ngram model estimation failed. Exiting.") + sys.exit (1) + + return + + def ConvertARPAToWFST (self) : + """Convert the ARPA format joint n-gram model to Fst format. + + Convert the ARPA format joint n-gram model to an equivalent Fst + compatible with ```phonetisaurus-g2pfst```. + + Returns: + bool: True on success, False on failure. + """ + + arpa_to_fst_command = self.makeARPAToWFSTCommand () + + self.logger.info ("Converting ARPA format joint n-gram " + "model to WFST format...") + try : + if self.verbose : + subprocess.check_call (arpa_to_fst_command) + else : + with open (os.devnull, "w") as devnull : + subprocess.check_call ( + arpa_to_fst_command, + stderr=devnull, + stdout=devnull + ) + except subprocess.CalledProcessError : + self.logger.error ("ARPA to WFST conversion failed. 
Exiting.") + sys.exit (1) + + return + + def TrainG2PModel (self) : + self.checkPhonetisaurusConfig () + + self.AlignLexicon () + self.TrainNGramModel () + self.ConvertARPAToWFST () + + self.logger.info ( + "G2P training succeeded: \033[92m{0}\033[0m"\ + .format (self.model_path) + ) + + return + +if __name__ == "__main__" : + import sys, argparse + + example = "{0} --lexicon cmud.dic --seq2_del".format (sys.argv [0]) + parser = argparse.ArgumentParser (description=example) + parser.add_argument ("--lexicon", "-l", help="Training lexicon to use.", + required=True) + parser.add_argument ("--dir_prefix", "-dp", help="Output directory prefix.", + default="train") + parser.add_argument ("--model_prefix", "-mp", help="Output model prefix.", + default="model") + parser.add_argument ("--ngram_order", "-o", help="Maximum ngram order " + "for joint ngram model.", type=int, default=8) + parser.add_argument ("--seq1_del", "-s1d", help="Allow alignment deletions " + "in sequence one (graphemes).", + default=False, action="store_true") + parser.add_argument ("--seq2_del", "-s2d", help="Allow alignment deletions " + "in sequence two (phonemes).", + default=False, action="store_true") + parser.add_argument ("--seq1_max", "-s1m", help="Maximum subsequence " + "length for graphemic alignment chunks.", + type=int, default=2) + parser.add_argument ("--seq2_max", "-s2m", help="Maximum subsequence " + "length for phonemic alignment chunks.", + type=int, default=2) + parser.add_argument ("--lm", "-lm", help="LM toolkit to use.", + default="mitlm") + parser.add_argument ("--verbose", "-v", help="Verbose mode.", + default=False, action="store_true") + args = parser.parse_args () + + trainer = G2PModelTrainer (args.lexicon, **args.__dict__) + trainer.TrainG2PModel ()