Commit a447b9f
Add option to convert input data to NFKC internally (issue #11)
tsproisl committed Aug 2, 2021
1 parent 17e3d84 commit a447b9f
Showing 2 changed files with 35 additions and 16 deletions.
13 changes: 7 additions & 6 deletions someweta/cli.py
@@ -34,13 +34,14 @@ def arguments():
     parser.add_argument("--w2v", type=argparse.FileType("r"), help="Word2Vec vectors; optional and only for training or cross-validation")
     parser.add_argument("--lexicon", type=os.path.abspath, help="Additional full-form lexicon; optional and only for training or cross-validation")
     parser.add_argument("--mapping", type=os.path.abspath, help="Additional mapping to coarser tagset; optional and only for tagging, evaluating or cross-validation")
-    parser.add_argument("--ignore-tag", help="Ignore this tag (useful for partial annotation); optional and only for training, evaluating or cross-validation")
+    parser.add_argument("--ignore-tag", type=str, help="Ignore this tag (useful for partial annotation); optional and only for training, evaluating or cross-validation")
     parser.add_argument("--prior", type=os.path.abspath, help="Prior weights, i.e. a model trained on another corpus; optional and only for training or cross-validation")
     parser.add_argument("-i", "--iterations", type=int, default=10, help="Only for training or cross-validation: Number of iterations; default: 10")
     parser.add_argument("-b", "--beam-size", type=int, default=5, help="Size of the search beam; default: 5")
     parser.add_argument("--parallel", type=int, default=1, metavar="N", help="Run N worker processes (up to the number of CPUs) to speed up tagging.")
     parser.add_argument("-x", "--xml", action="store_true", help="The input is an XML file. We assume that each tag is on a separate line. Otherwise the format is the same as for regular files with respect to tag and sentence delimiters.")
-    parser.add_argument("--sentence_tag", "--sentence-tag", type=str, help="Tag name for sentence boundaries (e.g. --sentence_tag s). Use this option, if input sentences are delimited by XML tags (e.g. <s>…</s>) instead of empty lines. Implies -x/--xml.")
+    parser.add_argument("--sentence-tag", "--sentence_tag", type=str, help="Tag name for sentence boundaries (e.g. --sentence-tag s). Use this option, if input sentences are delimited by XML tags (e.g. <s>…</s>) instead of empty lines. Implies -x/--xml.")
+    parser.add_argument("--use-nfkc", action="store_true", help="Convert input to NFKC before feeding it to the tagger. This only affects the internal representation of the data.")
     parser.add_argument("--progress", action="store_true", help="Show progress when tagging a file.")
     parser.add_argument("-v", "--version", action="version", version="SoMeWeTa %s" % __version__, help="Output version information and exit.")
     parser.add_argument("CORPUS", type=argparse.FileType("r", encoding="utf-8"),
@@ -55,8 +56,8 @@ def arguments():


 def evaluate_fold(args):
-    i, beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec, words, tags, lengths, sentence_ranges, div, mod = args
-    asptagger = ASPTagger(beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec)
+    i, beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec, ignore_tag, use_nfkc, words, tags, lengths, sentence_ranges, div, mod = args
+    asptagger = ASPTagger(beam_size, iterations, lexicon, mapping, brown_clusters, word_to_vec, ignore_tag, use_nfkc)
     test_ranges = sentence_ranges[i * div + min(i, mod):(i + 1) * div + min(i + 1, mod)]
     test_start = test_ranges[0][0]
     test_end = test_ranges[-1][0] + test_ranges[-1][1]
@@ -195,7 +196,7 @@ def main():
         word_to_vec = utils.read_word2vec_vectors(args.w2v)
     if args.sentence_tag is not None:
         args.xml = True
-    asptagger = ASPTagger(args.beam_size, args.iterations, lexicon, mapping, brown_clusters, word_to_vec, args.ignore_tag)
+    asptagger = ASPTagger(args.beam_size, args.iterations, lexicon, mapping, brown_clusters, word_to_vec, args.ignore_tag, args.use_nfkc)
     if args.prior and (args.train or args.crossvalidate):
         asptagger.load_prior_model(args.prior)
     if args.train:
@@ -253,7 +254,7 @@ def main():
         sentence_ranges = list(zip((a - b for a, b in zip(itertools.accumulate(lengths), lengths)), lengths))
         div, mod = divmod(len(sentence_ranges), 10)
         with multiprocessing.Pool() as pool:
-            accs = pool.map(evaluate_fold, zip(range(10), itertools.repeat(args.beam_size), itertools.repeat(args.iterations), itertools.repeat(lexicon), itertools.repeat(mapping), itertools.repeat(brown_clusters), itertools.repeat(word_to_vec), itertools.repeat(words), itertools.repeat(tags), itertools.repeat(lengths), itertools.repeat(sentence_ranges), itertools.repeat(div), itertools.repeat(mod)))
+            accs = pool.map(evaluate_fold, zip(range(10), itertools.repeat(args.beam_size), itertools.repeat(args.iterations), itertools.repeat(lexicon), itertools.repeat(mapping), itertools.repeat(brown_clusters), itertools.repeat(word_to_vec), itertools.repeat(args.ignore_tag), itertools.repeat(args.use_nfkc), itertools.repeat(words), itertools.repeat(tags), itertools.repeat(lengths), itertools.repeat(sentence_ranges), itertools.repeat(div), itertools.repeat(mod)))
         accuracies, accuracies_iv, accuracies_oov, coarse_accuracies, coarse_accuracies_iv, coarse_accuracies_oov = zip(*accs)
         mean_accuracy = statistics.mean(accuracies)
         # 2.26 is the approximate value of the 97.5 percentile point
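
Since Pool.map passes exactly one argument to each worker, the constant parameters are broadcast to all ten folds with itertools.repeat and bundled into one tuple per fold by zip; zip stops at the shortest iterable, so the infinite repeats are trimmed to the ten fold indices. A stripped-down sketch of the same pattern (the worker and its payload are illustrative, not taken from the repository):

    import itertools
    import multiprocessing

    def worker(args):
        # unpack the tuple that zip() bundled for this fold
        i, beam_size, use_nfkc = args
        return i, beam_size, use_nfkc

    if __name__ == "__main__":
        with multiprocessing.Pool() as pool:
            results = pool.map(worker, zip(range(10),
                                           itertools.repeat(5),
                                           itertools.repeat(True)))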
38 changes: 28 additions & 10 deletions someweta/tagger.py
@@ -7,6 +7,7 @@
 import json
 import math
 import sys
+import unicodedata

 import numpy as np
 import regex as re
@@ -19,8 +20,9 @@ class ASPTagger(AveragedStructuredPerceptron):
     perceptron.
     """
-    def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown_clusters=None, word_to_vec=None, ignore_tag=None):
+    def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown_clusters=None, word_to_vec=None, ignore_tag=None, use_nfkc=False):
         super().__init__(beam_size=beam_size, beam_history=2, iterations=iterations, latent_features=None, ignore_target=ignore_tag)
+        self.use_nfkc = use_nfkc
         self.vocabulary = set()
         self.lexicon = lexicon
         self.mapping = mapping
@@ -104,21 +106,29 @@ def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown

     def train(self, words, tags, lengths):
         """"""
-        lower_words = [w.lower() for w in words]
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in words]
+        else:
+            feature_words = words
+        lower_words = [w.lower() for w in feature_words]
         self.latent_features = functools.partial(self._get_latent_features, lower_words)
-        self.vocabulary.update(set(words))
+        self.vocabulary.update(set(feature_words))
         # self.vocabulary.update(set(lower_words))
         # <OOV>
         # # vocabulary = all lower case word forms except hapax legomena
         # self.vocabulary.update(set(k for k, v in collections.Counter(lower_words).items() if v > 1))
         # </OOV>
-        X = self._get_static_features(words, lengths)
+        X = self._get_static_features(feature_words, lengths)
         self.fit(X, tags, lengths)

     def tag(self, words, lengths):
         """"""
-        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in words])
-        X = self._get_static_features(words, lengths)
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in words]
+        else:
+            feature_words = words
+        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in feature_words])
+        X = self._get_static_features(feature_words, lengths)
         tags = self.predict(X, lengths)
         start = 0
         for length, local_tags in zip(lengths, tags):
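
The same four-line guard now appears in train, tag, tag_sentence, and evaluate: the normalized feature_words feed feature extraction and the vocabulary, while the caller's original tokens are left untouched, which is why the option "only affects the internal representation of the data". If one wanted to avoid the repetition, the guard could be factored into a small helper along these lines (a hypothetical refactoring, not part of this commit):

    def _maybe_nfkc(self, words):
        # Return NFKC-normalized copies when the tagger was created with
        # use_nfkc=True; otherwise return the input list unchanged.
        if self.use_nfkc:
            return [unicodedata.normalize("NFKC", w) for w in words]
        return words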
@@ -132,8 +142,12 @@ def tag(self, words, lengths):
     def tag_sentence(self, sentence):
         """"""
         sentence_length = [len(sentence)]
-        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in sentence])
-        X = self._get_static_features(sentence, sentence_length)
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in sentence]
+        else:
+            feature_words = sentence
+        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in feature_words])
+        X = self._get_static_features(feature_words, sentence_length)
         tags = list(self.predict(X, sentence_length))[0]
         if self.mapping is not None:
             return list(zip(sentence, tags, (self.mapping[lt] for lt in tags)))
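
Because tag_sentence zips the original sentence with the predicted tags, callers see their own surface forms even when normalization is switched on. A usage sketch under stated assumptions (the load call is assumed from SoMeWeTa's documented API and is not part of this diff; the model path is a placeholder):

    from someweta import ASPTagger

    tagger = ASPTagger(beam_size=5, iterations=10, use_nfkc=True)
    tagger.load("path/to/model")  # placeholder path, assumes a trained model
    # The full-width exclamation mark is normalized to "!" internally,
    # but the returned (token, tag) pairs keep the original character.
    print(tagger.tag_sentence(["Das", "ist", "toll", "！"]))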
@@ -158,8 +172,12 @@ def tag_xml_sentence(self, sentence):

     def evaluate(self, words, tags, lengths):
         """"""
-        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in words])
-        X = self._get_static_features(words, lengths)
+        if self.use_nfkc:
+            feature_words = [unicodedata.normalize("NFKC", w) for w in words]
+        else:
+            feature_words = words
+        self.latent_features = functools.partial(self._get_latent_features, [w.lower() for w in feature_words])
+        X = self._get_static_features(feature_words, lengths)
         # accuracy = self.score(X, tags, lengths)
         # return accuracy
         predicted = self.predict(X, lengths)
