diff --git a/someweta/cli.py b/someweta/cli.py
index ae7433f..b2c10f9 100644
--- a/someweta/cli.py
+++ b/someweta/cli.py
@@ -34,13 +34,14 @@ def arguments():
     parser.add_argument("--w2v", type=argparse.FileType("r"), help="Word2Vec vectors; optional and only for training or cross-validation")
     parser.add_argument("--lexicon", type=os.path.abspath, help="Additional full-form lexicon; optional and only for training or cross-validation")
     parser.add_argument("--mapping", type=os.path.abspath, help="Additional mapping to coarser tagset; optional and only for tagging, evaluating or cross-validation")
-    parser.add_argument("--ignore-tag", help="Ignore this tag (useful for partial annotation); optional and only for training, evaluating or cross-validation")
+    parser.add_argument("--ignore-tag", type=str, help="Ignore this tag (useful for partial annotation); optional and only for training, evaluating or cross-validation")
     parser.add_argument("--prior", type=os.path.abspath, help="Prior weights, i.e. a model trained on another corpus; optional and only for training or cross-validation")
     parser.add_argument("-i", "--iterations", type=int, default=10, help="Only for training or cross-validation: Number of iterations; default: 10")
     parser.add_argument("-b", "--beam-size", type=int, default=5, help="Size of the search beam; default: 5")
     parser.add_argument("--parallel", type=int, default=1, metavar="N", help="Run N worker processes (up to the number of CPUs) to speed up tagging.")
     parser.add_argument("-x", "--xml", action="store_true", help="The input is an XML file. We assume that each tag is on a separate line. Otherwise the format is the same as for regular files with respect to tag and sentence delimiters.")
-    parser.add_argument("--sentence_tag", "--sentence-tag", type=str, help="Tag name for sentence boundaries (e.g. --sentence_tag s). Use this option, if input sentences are delimited by XML tags (e.g. ) instead of empty lines. Implies -x/--xml.")
+    parser.add_argument("--sentence-tag", "--sentence_tag", type=str, help="Tag name for sentence boundaries (e.g. --sentence-tag s). Use this option, if input sentences are delimited by XML tags (e.g. ) instead of empty lines. Implies -x/--xml.")
+    parser.add_argument("--use-nfkc", action="store_true", help="Convert input to NFKC before feeding it to the tagger. This only affects the internal representation of the data.")
     parser.add_argument("--progress", action="store_true", help="Show progress when tagging a file.")
     parser.add_argument("-v", "--version", action="version", version="SoMeWeTa %s" % __version__, help="Output version information and exit.")
     parser.add_argument("CORPUS", type=argparse.FileType("r", encoding="utf-8"),
@@ -55,8 +56,8 @@ def arguments():
 
 
 def evaluate_fold(args):
-    i, beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec, words, tags, lengths, sentence_ranges, div, mod = args
-    asptagger = ASPTagger(beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec)
+    i, beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec, ignore_tag, use_nfkc, words, tags, lengths, sentence_ranges, div, mod = args
+    asptagger = ASPTagger(beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec, ignore_tag, use_nfkc)
     test_ranges = sentence_ranges[i * div + min(i, mod):(i + 1) * div + min(i + 1, mod)]
     test_start = test_ranges[0][0]
     test_end = test_ranges[-1][0] + test_ranges[-1][1]
@@ -195,7 +196,7 @@ def main():
         word_to_vec = utils.read_word2vec_vectors(args.w2v)
     if args.sentence_tag is not None:
         args.xml = True
-    asptagger = ASPTagger(args.beam_size, args.iterations, lexicon, mapping, brown_clusters, word_to_vec, args.ignore_tag)
+    asptagger = ASPTagger(args.beam_size, args.iterations, lexicon, mapping, brown_clusters, word_to_vec, args.ignore_tag, args.use_nfkc)
     if args.prior and (args.train or args.crossvalidate):
         asptagger.load_prior_model(args.prior)
     if args.train:
@@ -253,7 +254,7 @@ def main():
        sentence_ranges = list(zip((a - b for a, b in zip(itertools.accumulate(lengths), lengths)), lengths))
        div, mod = divmod(len(sentence_ranges), 10)
        with multiprocessing.Pool() as pool:
-           accs = pool.map(evaluate_fold, zip(range(10), itertools.repeat(args.beam_size), itertools.repeat(args.iterations), itertools.repeat(lexicon), itertools.repeat(mapping), itertools.repeat(brown_clusters), itertools.repeat(word_to_vec), itertools.repeat(words), itertools.repeat(tags), itertools.repeat(lengths), itertools.repeat(sentence_ranges), itertools.repeat(div), itertools.repeat(mod)))
+           accs = pool.map(evaluate_fold, zip(range(10), itertools.repeat(args.beam_size), itertools.repeat(args.iterations), itertools.repeat(lexicon), itertools.repeat(mapping), itertools.repeat(brown_clusters), itertools.repeat(word_to_vec), itertools.repeat(args.ignore_tag), itertools.repeat(args.use_nfkc), itertools.repeat(words), itertools.repeat(tags), itertools.repeat(lengths), itertools.repeat(sentence_ranges), itertools.repeat(div), itertools.repeat(mod)))
        accuracies, accuracies_iv, accuracies_oov, coarse_accuracies, coarse_accuracies_iv, coarse_accuracies_oov = zip(*accs)
        mean_accuracy = statistics.mean(accuracies)
        # 2.26 is the approximate value of the 97.5 percentile point
diff --git a/someweta/tagger.py b/someweta/tagger.py
index 9570575..0d25858 100644
--- a/someweta/tagger.py
+++ b/someweta/tagger.py
@@ -7,6 +7,7 @@
 import json
 import math
 import sys
+import unicodedata
 
 import numpy as np
 import regex as re
@@ -19,8 +20,9 @@ class ASPTagger(AveragedStructuredPerceptron):
    perceptron.
 
    """
-    def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown_clusters=None, word_to_vec=None, ignore_tag=None):
+    def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown_clusters=None, word_to_vec=None, ignore_tag=None, use_nfkc=False):
         super().__init__(beam_size=beam_size, beam_history=2, iterations=iterations, latent_features=None, ignore_target=ignore_tag)
+        self.use_nfkc = use_nfkc
         self.vocabulary = set()
         self.lexicon = lexicon
         self.mapping = mapping
@@ -104,21 +106,29 @@ def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown
 
     def train(self, words, tags, lengths):
         """"""
-        lower_words = [w.lower() for w in words]
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in words]
+        else:
+            feature_words = words
+        lower_words = [w.lower() for w in feature_words]
         self.latent_features = functools.partial(self._get_latent_features, lower_words)
-        self.vocabulary.update(set(words))
+        self.vocabulary.update(set(feature_words))
         # self.vocabulary.update(set(lower_words))
         #
         # # vocabulary = all lower case word forms except hapax legomena
         # self.vocabulary.update(set(k for k, v in collections.Counter(lower_words).items() if v > 1))
         #
-        X = self._get_static_features(words, lengths)
+        X = self._get_static_features(feature_words, lengths)
         self.fit(X, tags, lengths)
 
     def tag(self, words, lengths):
         """"""
-        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in words])
-        X = self._get_static_features(words, lengths)
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in words]
+        else:
+            feature_words = words
+        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in feature_words])
+        X = self._get_static_features(feature_words, lengths)
         tags = self.predict(X, lengths)
         start = 0
         for length, local_tags in zip(lengths, tags):
@@ -132,8 +142,12 @@ def tag(self, words, lengths):
     def tag_sentence(self, sentence):
         """"""
         sentence_length = [len(sentence)]
-        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in sentence])
-        X = self._get_static_features(sentence, sentence_length)
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in sentence]
+        else:
+            feature_words = sentence
+        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in feature_words])
+        X = self._get_static_features(feature_words, sentence_length)
         tags = list(self.predict(X, sentence_length))[0]
         if self.mapping is not None:
             return list(zip(sentence, tags, (self.mapping[lt] for lt in tags)))
@@ -158,8 +172,12 @@ def tag_xml_sentence(self, sentence):
 
     def evaluate(self, words, tags, lengths):
         """"""
-        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in words])
-        X = self._get_static_features(words, lengths)
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in words]
+        else:
+            feature_words = words
+        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in feature_words])
+        X = self._get_static_features(feature_words, lengths)
         # accuracy = self.score(X, tags, lengths)
         # return accuracy
         predicted = self.predict(X, lengths)