Commit a447b9f
Add option to convert input data to NFKC internally (issue #11)
tsproisl committed Aug 2, 2021
1 parent 17e3d84 commit a447b9f
Showing 2 changed files with 35 additions and 16 deletions.
13 changes: 7 additions & 6 deletions someweta/cli.py
@@ -34,13 +34,14 @@ def arguments():
     parser.add_argument("--w2v", type=argparse.FileType("r"), help="Word2Vec vectors; optional and only for training or cross-validation")
     parser.add_argument("--lexicon", type=os.path.abspath, help="Additional full-form lexicon; optional and only for training or cross-validation")
     parser.add_argument("--mapping", type=os.path.abspath, help="Additional mapping to coarser tagset; optional and only for tagging, evaluating or cross-validation")
-    parser.add_argument("--ignore-tag", help="Ignore this tag (useful for partial annotation); optional and only for training, evaluating or cross-validation")
+    parser.add_argument("--ignore-tag", type=str, help="Ignore this tag (useful for partial annotation); optional and only for training, evaluating or cross-validation")
     parser.add_argument("--prior", type=os.path.abspath, help="Prior weights, i.e. a model trained on another corpus; optional and only for training or cross-validation")
     parser.add_argument("-i", "--iterations", type=int, default=10, help="Only for training or cross-validation: Number of iterations; default: 10")
     parser.add_argument("-b", "--beam-size", type=int, default=5, help="Size of the search beam; default: 5")
     parser.add_argument("--parallel", type=int, default=1, metavar="N", help="Run N worker processes (up to the number of CPUs) to speed up tagging.")
     parser.add_argument("-x", "--xml", action="store_true", help="The input is an XML file. We assume that each tag is on a separate line. Otherwise the format is the same as for regular files with respect to tag and sentence delimiters.")
-    parser.add_argument("--sentence_tag", "--sentence-tag", type=str, help="Tag name for sentence boundaries (e.g. --sentence_tag s). Use this option, if input sentences are delimited by XML tags (e.g. <s>…</s>) instead of empty lines. Implies -x/--xml.")
+    parser.add_argument("--sentence-tag", "--sentence_tag", type=str, help="Tag name for sentence boundaries (e.g. --sentence-tag s). Use this option, if input sentences are delimited by XML tags (e.g. <s>…</s>) instead of empty lines. Implies -x/--xml.")
+    parser.add_argument("--use-nfkc", action="store_true", help="Convert input to NFKC before feeding it to the tagger. This only affects the internal representation of the data.")
     parser.add_argument("--progress", action="store_true", help="Show progress when tagging a file.")
     parser.add_argument("-v", "--version", action="version", version="SoMeWeTa %s" % __version__, help="Output version information and exit.")
     parser.add_argument("CORPUS", type=argparse.FileType("r", encoding="utf-8"),
@@ -55,8 +56,8 @@ def arguments():


 def evaluate_fold(args):
-    i, beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec, words, tags, lengths, sentence_ranges, div, mod = args
-    asptagger = ASPTagger(beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec)
+    i, beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec, ignore_tag, use_nfkc, words, tags, lengths, sentence_ranges, div, mod = args
+    asptagger = ASPTagger(beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec, ignore_tag, use_nfkc)
     test_ranges = sentence_ranges[i * div + min(i, mod):(i + 1) * div + min(i + 1, mod)]
     test_start = test_ranges[0][0]
     test_end = test_ranges[-1][0] + test_ranges[-1][1]
@@ -195,7 +196,7 @@ def main():
         word_to_vec = utils.read_word2vec_vectors(args.w2v)
     if args.sentence_tag is not None:
         args.xml = True
-    asptagger = ASPTagger(args.beam_size, args.iterations, lexicon, mapping, brown_clusters, word_to_vec, args.ignore_tag)
+    asptagger = ASPTagger(args.beam_size, args.iterations, lexicon, mapping, brown_clusters, word_to_vec, args.ignore_tag, args.use_nfkc)
     if args.prior and (args.train or args.crossvalidate):
         asptagger.load_prior_model(args.prior)
     if args.train:
@@ -253,7 +254,7 @@ def main():
         sentence_ranges = list(zip((a - b for a, b in zip(itertools.accumulate(lengths), lengths)), lengths))
         div, mod = divmod(len(sentence_ranges), 10)
         with multiprocessing.Pool() as pool:
-            accs = pool.map(evaluate_fold, zip(range(10), itertools.repeat(args.beam_size), itertools.repeat(args.iterations), itertools.repeat(lexicon), itertools.repeat(mapping), itertools.repeat(brown_clusters), itertools.repeat(word_to_vec), itertools.repeat(words), itertools.repeat(tags), itertools.repeat(lengths), itertools.repeat(sentence_ranges), itertools.repeat(div), itertools.repeat(mod)))
+            accs = pool.map(evaluate_fold, zip(range(10), itertools.repeat(args.beam_size), itertools.repeat(args.iterations), itertools.repeat(lexicon), itertools.repeat(mapping), itertools.repeat(brown_clusters), itertools.repeat(word_to_vec), itertools.repeat(args.ignore_tag), itertools.repeat(args.use_nfkc), itertools.repeat(words), itertools.repeat(tags), itertools.repeat(lengths), itertools.repeat(sentence_ranges), itertools.repeat(div), itertools.repeat(mod)))
         accuracies, accuracies_iv, accuracies_oov, coarse_accuracies, coarse_accuracies_iv, coarse_accuracies_oov = zip(*accs)
         mean_accuracy = statistics.mean(accuracies)
         # 2.26 is the approximate value of the 97.5 percentile point
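
Since Pool.map passes exactly one argument to each worker, the constant parameters are broadcast to all ten folds with itertools.repeat and bundled into one tuple per fold by zip; zip stops at the shortest iterable, so the infinite repeats are trimmed to the ten fold indices. A stripped-down sketch of the same pattern (the worker and its payload are illustrative, not taken from the repository):

    import itertools
    import multiprocessing

    def worker(args):
        # unpack the tuple that zip() bundled for this fold
        i, beam_size, use_nfkc = args
        return i, beam_size, use_nfkc

    if __name__ == "__main__":
        with multiprocessing.Pool() as pool:
            results = pool.map(worker, zip(range(10),
                                           itertools.repeat(5),
                                           itertools.repeat(True)))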
38 changes: 28 additions & 10 deletions someweta/tagger.py
@@ -7,6 +7,7 @@
 import json
 import math
 import sys
+import unicodedata

 import numpy as np
 import regex as re
@@ -19,8 +20,9 @@ class ASPTagger(AveragedStructuredPerceptron):
     perceptron.
     """
-    def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown_clusters=None, word_to_vec=None, ignore_tag=None):
+    def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown_clusters=None, word_to_vec=None, ignore_tag=None, use_nfkc=False):
         super().__init__(beam_size=beam_size, beam_history=2, iterations=iterations, latent_features=None, ignore_target=ignore_tag)
+        self.use_nfkc = use_nfkc
         self.vocabulary = set()
         self.lexicon = lexicon
         self.mapping = mapping
@@ -104,21 +106,29 @@ def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown

     def train(self, words, tags, lengths):
         """"""
-        lower_words = [w.lower() for w in words]
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in words]
+        else:
+            feature_words = words
+        lower_words = [w.lower() for w in feature_words]
         self.latent_features = functools.partial(self._get_latent_features, lower_words)
-        self.vocabulary.update(set(words))
+        self.vocabulary.update(set(feature_words))
         # self.vocabulary.update(set(lower_words))
         # <OOV>
         # # vocabulary = all lower case word forms except hapax legomena
         # self.vocabulary.update(set(k for k, v in collections.Counter(lower_words).items() if v > 1))
         # </OOV>
-        X = self._get_static_features(words, lengths)
+        X = self._get_static_features(feature_words, lengths)
         self.fit(X, tags, lengths)

     def tag(self, words, lengths):
         """"""
-        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in words])
-        X = self._get_static_features(words, lengths)
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in words]
+        else:
+            feature_words = words
+        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in feature_words])
+        X = self._get_static_features(feature_words, lengths)
         tags = self.predict(X, lengths)
         start = 0
         for length, local_tags in zip(lengths, tags):
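
The same four-line guard now appears in train, tag, tag_sentence, and evaluate: the normalized feature_words feed feature extraction and the vocabulary, while the caller's original tokens are left untouched, which is why the option "only affects the internal representation of the data". If one wanted to avoid the repetition, the guard could be factored into a small helper along these lines (a hypothetical refactoring, not part of this commit):

    def _maybe_nfkc(self, words):
        # Return NFKC-normalized copies when the tagger was created with
        # use_nfkc=True; otherwise return the input list unchanged.
        if self.use_nfkc:
            return [unicodedata.normalize("NFKC", w) for w in words]
        return words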
@@ -132,8 +142,12 @@ def tag(self, words, lengths):
     def tag_sentence(self, sentence):
         """"""
         sentence_length = [len(sentence)]
-        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in sentence])
-        X = self._get_static_features(sentence, sentence_length)
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in sentence]
+        else:
+            feature_words = sentence
+        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in feature_words])
+        X = self._get_static_features(feature_words, sentence_length)
         tags = list(self.predict(X, sentence_length))[0]
         if self.mapping is not None:
             return list(zip(sentence, tags, (self.mapping[lt] for lt in tags)))
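
Because tag_sentence zips the original sentence with the predicted tags, callers see their own surface forms even when normalization is switched on. A usage sketch under stated assumptions (the load call is assumed from SoMeWeTa's documented API and is not part of this diff; the model path is a placeholder):

    from someweta import ASPTagger

    tagger = ASPTagger(beam_size=5, iterations=10, use_nfkc=True)
    tagger.load("path/to/model")  # placeholder path, assumes a trained model
    # The full-width exclamation mark is normalized to "!" internally,
    # but the returned (token, tag) pairs keep the original character.
    print(tagger.tag_sentence(["Das", "ist", "toll", "！"]))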
@@ -158,8 +172,12 @@ def tag_xml_sentence(self, sentence):

     def evaluate(self, words, tags, lengths):
         """"""
-        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in words])
-        X = self._get_static_features(words, lengths)
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in words]
+        else:
+            feature_words = words
+        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in feature_words])
+        X = self._get_static_features(feature_words, lengths)
         # accuracy = self.score(X, tags, lengths)
         # return accuracy
         predicted = self.predict(X, lengths)
