#! /usr/bin/env python
#
# tpo_text_processing.py: performs text processing (e.g., via NLTK), mainly for
# word tokenization and part-of-speech tagging. To make the part-of-speech
# output easier to digest, there is an environment option USE_CLASS_TAGS
# to enable use of class-based tags, such as those used in traditional
# grammar (e.g., noun for NNS).
#
#
# Notes:
# - function result caching ("memoization") is used via memodict decoration
# - environment variables (env-var) are used for some ad hoc options
# - to bypass NLTK (e.g., for quick debugging), set the SKIP_NLTK env-var to 1
#
#------------------------------------------------------------------------
# Miscellaneous notes
#
# Penn Tags (used by NLTK part-of-speech tagger)
#
# CC Coordinating conjunction
# CD Cardinal number
# DT Determiner
# EX Existential there
# FW Foreign word
# IN Preposition or subordinating conjunction
# JJ Adjective
# JJR Adjective, comparative
# JJS Adjective, superlative
# LS List item marker
# MD Modal
# NN Noun, singular or mass
# NNS Noun, plural
# NNP Proper noun, singular
# NNPS Proper noun, plural
# PDT Predeterminer
# POS Possessive ending
# PP Personal pronoun (PRP in the standard tagset)
# PP$ Possessive pronoun (PRP$ in the standard tagset)
# PRP$ Possessive pronoun
# PRP Personal pronoun
# RB Adverb
# RBR Adverb, comparative
# RBS Adverb, superlative
# RP Particle
# SYM Symbol
# TO to
# UH Interjection
# VB Verb, base form
# VBD Verb, past tense
# VBG Verb, gerund or present participle
# VBN Verb, past participle
# VBP Verb, non-3rd person singular present
# VBZ Verb, 3rd person singular present
# WDT Wh-determiner
# WP Wh-pronoun
# WP$ Possessive wh-pronoun
# WRB Wh-adverb
#
# See ftp://ftp.cis.upenn.edu/pub/treebank/doc/tagguide.ps.gz for more details.
#------------------------------------------------------------------------
# TODO:
# - Integrate punctuation isolation (see prep_brill.perl).
# - Add more usage examples (an illustrative one follows below).
# - Warn that without NLTK or Enchant, the text processing just checks common cases.
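#
# Illustrative standalone usage (a sketch; assumes NLTK and its tagger models
# are installed, with tags taken from the tag_part_of_speech EX case below):
#   $ echo "How now, brown cow?" | python tpo_text_processing.py -
#   text: How now, brown cow?
#   taggings: [('How', 'WRB'), ('now', 'RB'), (',', ','), ('brown', 'JJ'), ('cow', 'NN'), ('?', '.')]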
#
"""Performs text processing (e.g., via tokenization via NLTK)"""
#------------------------------------------------------------------------
# Library packages
import tpo_common as tpo
import glue_helpers as gh
import sys # system interface (e.g., command line)
import re # regular expressions
#------------------------------------------------------------------------
# Globals
# Object for spell checking via enchant
speller = None
word_freq_hash = None
WORD_FREQ_FILE = tpo.getenv_text("WORD_FREQ_FILE", "word.freq")
# Hash for returning most common part of speech for a word (or token)
word_POS_hash = None
WORD_POS_FREQ_FILE = tpo.getenv_text("WORD_POS_FREQ_FILE", "word-POS.freq")
# Misc. options
LINE_MODE = tpo.getenv_boolean("LINE_MODE", False, "Process text line by line (not all at once)")
JUST_TAGS = tpo.getenv_boolean("JUST_TAGS", False, "Just show part of speech tags (not word/POS pair)")
OUTPUT_DELIM = tpo.getenv_text("OUTPUT_DELIM", " ", "Delimiter (separator) for output lists")
INCLUDE_MISSPELLINGS = tpo.getenv_boolean("INCLUDE_MISSPELLINGS", False, "Include spell checking")
SKIP_MISSPELLINGS = tpo.getenv_boolean("SKIP_MISSPELLINGS", not INCLUDE_MISSPELLINGS, "Skip spell checking")
SHOW_MISSPELLINGS = not SKIP_MISSPELLINGS
USE_CLASS_TAGS = tpo.getenv_boolean("USE_CLASS_TAGS", False, "Use class-based tags (e.g., Noun for NNP)")
SKIP_CLASS_TAGS = not USE_CLASS_TAGS
KEEP_PUNCT = tpo.getenv_boolean("KEEP_PUNCT", False, "Use punctuation symbol as part-of-speech label")
TERSE_OUTPUT = tpo.getenv_boolean("TERSE_OUTPUT", JUST_TAGS, "Terse output mode")
VERBOSE = not TERSE_OUTPUT
# Skip use of NLTK and/or ENCHANT packages (using simple versions of functions)
# TODO: make misspellings optional (add --classic mode???)
SKIP_NLTK = tpo.getenv_boolean("SKIP_NLTK", False, "Omit usage of Natural Language Toolkit (NLTK)")
SKIP_ENCHANT = tpo.getenv_boolean("SKIP_ENCHANT", SKIP_MISSPELLINGS, "Omit usage of Enchant package for spell checking")
# List of stopwords (e.g., high-frequency function words)
stopwords = None
#------------------------------------------------------------------------
# Optional libraries
if not SKIP_NLTK:
import nltk # NLP toolkit
if not SKIP_ENCHANT:
import enchant # spell checking
#------------------------------------------------------------------------
# Functions
def split_sentences(text):
"""Splits TEXT into sentences"""
# EX: split_sentences("I came. I saw. I conquered!") => ["I came.", "I saw.", "I conquered!"]
# EX: split_sentences("Dr. Watson, it's elementary. But why?") => ["Dr. Watson, it's elementary." "But why?"]
if SKIP_NLTK:
# Split around sentence-ending punctuation followed by space,
# but excluding initials (TODO: handle abbreviations, e.g., "mo.")
#
# TEST: Replace tabs with space and newlines with two spaces
## text = re.sub(r"\t", " ", text)
## text = re.sub(r"\n", " ", text)
#
# Make sure ending punctuation is followed by two spaces and preceded by one
text = re.sub(r"([\.\!\?])\s", r" \1  ", text)  # note: two trailing spaces in replacement
#
# Remove spacing added above after likely abbreviations
text = re.sub(r"\b([A-Z][a-z]*)\s\.\s\s", r"\1. ", text)
#
# Split sentences by ending punctuation followed by two spaces
# Note: uses a "positive lookbehind assertion" (i.e., "(?<=...)") to retain the punctuation
sentences = re.split(r"(?<=[\.\!\?])\s\s+", text.strip())
else:
sentences = nltk.tokenize.sent_tokenize(text)
return sentences
def split_word_tokens(text):
"""Splits TEXT into word tokens (i.e., words, punctuation, etc.) Note: run split_sentences first (e.g., to allow for proper handling of periods)."""
# EX: split_word_tokens("How now, brown cow?") => ['How', 'now', ',', 'brown', 'cow', '?']
tpo.debug_print("split_word_tokens(%s); type=%s" % (text, type(text)), 7)
if SKIP_NLTK:
tokens = [t.strip() for t in re.split(r"(\W+)", text) if (len(t.strip()) > 0)]
else:
tokens = nltk.word_tokenize(text)
tpo.debug_print("tokens: %s" % [(t, type(t)) for t in tokens], 7)
return tokens
def label_for_tag(POS, word=None):
"""Returns part-of-speech label for POS, optionally overriden based on WORD (e.g., original token if punctuation)"""
label = POS
if KEEP_PUNCT and word and re.match(r"\W", word[0]):
label = word
tpo.debug_format("label_for_tag({t}, {w}) => {l}", 5, t=POS, w=word, l=label)
return label
fallback_POS_class = { # classes for parts-of-speech not covered by rules (see class_for_tag)
"CC": "conjunction", # Coordinating conjunction
"CD": "number", # Cardinal number
"DT": "determiner", # Determiner
"EX": "pronoun", # Existential there
"FW": "noun", # Foreign word
"IN": "preposition", # Preposition or subordinating conjunction
"JJ": "adjective", # Adjective
"JJR": "adjective", # Adjective, comparative
"JJS": "adjective", # Adjective, superlative
"LS": "punctuation", # List item marker
"MD": "auxiliary", # Modal
"PDT": "determiner", # Predeterminer
"POS": "punctuation", # Possessive ending
"SYM": "punctuation", # Symbol
"TO": "preposition", # to
"UH": "punctuation", # Interjection
"WDT": "determiner", # Wh-determiner
"WP": "pronoun", # Wh-pronoun
"WP$": "pronoun", # Possessive wh-pronoun
"WRB": "adverb", # Wh-adverb
}
def class_for_tag(POS, word=None, previous=None):
"""Returns class label for POS tag, optionally over WORD and for PREVIOUS tag. Note: most cases are resolved without previous tag for context, except for gerunds and past participles which are considered nouns unless previous is auxiliary. Similarly, the word is only considered for special cases like punctuation."""
# EX: class_for_tag("NNS") => "noun"
# EX: class_for_tag("VBG") => "noun"
# EX: class_for_tag("VBG", previous="MD") => "verb"
# EX: class_for_tag("NNP", word="(") => "punctuation"
tag_class = "unknown"
if (POS == "VBG") and previous and ((previous[:2] == "VB") or (previous == "MD")):
tag_class = "verb"
elif (POS == "VBP") and previous and ((previous[:2] == "VB") or (previous == "MD")):
tag_class = "verb"
elif word and re.match(r"\W", word[0]):
tag_class = "punctuation"
elif (POS == "VBG") or (POS == "VBP"):
# Gerunds and similar present forms default to noun outside auxiliary contexts (see note above)
tag_class = "noun"
# TODO: use prefix-based lookup table for rest (e.g., {"NN": "noun", ...})?
elif POS[:2] == "NN":
tag_class = "noun"
elif POS[:2] == "JJ":
tag_class = "adjective"
elif POS[:2] == "VB":
tag_class = "verb"
elif POS[:2] == "RB":
tag_class = "adverb"
elif (POS[:2] == "PR") or (POS[:2] == "PP"):
tag_class = "pronoun"
else:
tag_class = fallback_POS_class.get(POS, tag_class)
tpo.debug_format("class_for_tag({t}, {p}) => {c}", 5, t=POS, p=previous, c=tag_class)
return tag_class
def tag_part_of_speech(tokens):
"""Return list of part-of-speech taggings of form (token, tag) for list of TOKENS"""
# EX: tag_part_of_speech(['How', 'now', ',', 'brown', 'cow', '?']) => [('How', 'WRB'), ('now', 'RB'), (',', ','), ('brown', 'JJ'), ('cow', 'NN'), ('?', '.')]
if SKIP_NLTK:
part_of_speech_taggings = [(word, get_most_common_POS(word)) for word in tokens]
else:
part_of_speech_taggings = []
previous = None
raw_part_of_speech_taggings = nltk.pos_tag(tokens)
tpo.debug_print("raw tags: %s" % [t for (_w, t) in raw_part_of_speech_taggings], 3)
for (word, POS) in raw_part_of_speech_taggings:
tag = label_for_tag(POS, word) if SKIP_CLASS_TAGS else class_for_tag(POS, word, previous)
part_of_speech_taggings.append((word, tag))
previous = POS
tpo.debug_format("tag_part_of_speech({tokens}) => {tags}", 6,
tokens=tokens, tags=part_of_speech_taggings)
return part_of_speech_taggings
def tokenize_and_tag(text):
"""Run sentence(s) and word tokenization over text and then part-of-speech tag it"""
# TODO: return one list per sentence not just one combined list
tpo.debug_print("tokenize_and_tag(%s)" % text, 7)
text = tpo.ensure_unicode(text)
text_taggings = []
for sentence in split_sentences(text):
tpo.debug_print("sentence: %s" % sentence.strip(), 3)
tokens = split_word_tokens(sentence)
tpo.debug_print("tokens: %s" % tokens, 3)
taggings = tag_part_of_speech(tokens)
tpo.debug_print("taggings: %s" % taggings, 3)
# TODO: use append
text_taggings += taggings
return text_taggings
def tokenize_text(text):
"""Run sentence and word tokenization over text returning a list of sentence word lists"""
tokenized_lines = []
for sentence in split_sentences(text):
sent_tokens = split_word_tokens(sentence)
tpo.debug_print("sent_tokens: %s" % sent_tokens, 3)
tokenized_lines.append(sent_tokens)
return tokenized_lines
@tpo.memodict
def is_stopword(word):
"""Indicates whether WORD should generally be excluded from analysis (e.g., function word)"""
# Note: intended as a quick filter for excluding non-content words.
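# EX: is_stopword("The") => True
# EX: is_stopword("cow") => False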
global stopwords
if (stopwords is None):
if SKIP_NLTK:
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
else:
stopwords = nltk.corpus.stopwords.words('english')
tpo.debug_print("stopwords: %s" % stopwords, 4)
return (word.lower() in stopwords)
@tpo.memodict
def has_spelling_mistake(term):
"""Indicates whether TERM represents a spelling mistake"""
# TODO: rework in terms of a class-based interface
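# EX (assuming a standard en_US dictionary): has_spelling_mistake("dawg") => True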
has_mistake = False
try:
if SKIP_ENCHANT:
global word_freq_hash
if not word_freq_hash:
word_freq_path = gh.resolve_path(WORD_FREQ_FILE)
gh.assertion(gh.non_empty_file(word_freq_path))
word_freq_hash = read_freq_data(word_freq_path)
has_mistake = term.lower() not in word_freq_hash
else:
global speller
if not speller:
speller = enchant.Dict("en_US")
has_mistake = not speller.check(term)
except Exception:
tpo.debug_print("Warning: exception during spell checking of '%s': %s" % (term, str(sys.exc_info())))
return has_mistake
# read_freq_data(filename): Reads frequency listing for words (or other keys).
# Returns a hash table from (lowercased) keys to their frequencies.
#
# Sample input:
#
# # Word Freq
# the 179062
# to 123567
# is 99390
# and 95920
# a 76679
#
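# Illustrative result (based on the sample above; note frequencies stay as strings):
# EX: read_freq_data("word.freq")["the"] => "179062"
#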
def read_freq_data(filename):
tpo.debug_print("read_freq_data(%s)" % filename, 3)
freq_hash = {}
# Process each line of the file
input_handle = open(filename,"r")
line_num = 0
for line in input_handle:
line_num += 1
# Ignore comments
line = line.strip()
if (len(line) > 0) and (line[0] == '#'):
continue
# Extract the two fields, ignoring malformed lines
fields = line.split("\t")
if (len(fields) != 2):
tpo.debug_print("Ignoring line %d of %s: %s" % (line_num, filename, line), 3)
continue
(key, freq) = fields
key = key.strip().lower()
# Store in hash
if key not in freq_hash:
freq_hash[key] = freq
else:
tpo.debug_print("Ignoring alternative freq for key %s: %s (using %s)"
% (key, freq, freq_hash[key]), 6)
return (freq_hash)
# read_word_POS_data(filename): Reads frequency listing for words in particular parts of speech
# to derive a dictionary of the most common part-of-speech for each word (for quick-n-dirty
# part-of-speech tagging). Returns a hash table from (lowercased) words to their most common
# part-of-speech.
#
# Sample input:
#
# # Token POS Freq
# , , 379752
# . . 372550
# the DT 158317
# to TO 122189
#
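# Illustrative result (based on the sample above):
# EX: read_word_POS_data("word-POS.freq")["the"] => "DT"
#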
def read_word_POS_data(filename):
tpo.debug_print("read_word_POS_freq(%s)" % filename, 3)
word_POS_hash = {}
# Process each line of the file
input_handle = open(filename,"r")
line_num = 0
for line in input_handle:
line_num += 1
# Ignore comments
line = line.strip()
if (len(line) > 0) and (line[0] == '#'):
continue
# Extract the three fields, ignoring malformed lines
fields = line.split("\t")
if (len(fields) != 3):
tpo.debug_print("Ignoring line %d of %s: %s" % (line_num, filename, line), 3)
continue
(word, POS, freq) = fields
word = word.strip().lower()
# Store in hash
if word not in word_POS_hash:
word_POS_hash[word] = POS
else:
tpo.debug_print("Ignoring alternative POS for word %s: %s (using %s)"
% (word, POS, word_POS_hash[word]), 6)
return (word_POS_hash)
def get_most_common_POS(word):
"""Returns the most common part-of-speech label for WORD, defaulting to NN (noun)"""
# EX: get_most_common_POS("can") => "MD"
# EX: get_most_common_POS("notaword") => "NN"
global word_POS_hash
if not word_POS_hash:
word_POS_freq_path = gh.resolve_path(WORD_POS_FREQ_FILE)
gh.assertion(gh.non_empty_file(word_POS_freq_path))
word_POS_hash = read_word_POS_data(word_POS_freq_path)
label = "NN"
word = word.lower()
if (word in word_POS_hash):
label = word_POS_hash[word]
return label
#------------------------------------------------------------------------
# Utility functions
#
# TODO: make POS optional for is_POS-type functions (and use corpus frequencies to guess)
#
def is_noun(_token, POS):
"""Indicates if TOKEN is a noun, based on POS"""
return (POS[0:2] == "NN")
def is_verb(_token, POS):
"""Indicates if TOKEN is a verb, based on POS"""
# EX: is_verb('can', 'NN') => False
return (POS[0:2] == "VB")
def is_adverb(_token, POS):
"""Indicates if TOKEN is an adverb, based on POS"""
# EX: is_adverb('quickly', 'RB') => True
return (POS[0:2] == "RB")
def is_adjective(_token, POS):
"""Indicates if TOKEN is an adjective, based on POS"""
# EX: is_adjective('quick', 'JJ') => True
return (POS[0:2] == "JJ")
def is_comma(token, POS):
"""Indicates if TOKEN is a comma"""
return ((token == ",") or (POS[0:1] == ","))
def is_quote(token, _POS):
"""Indicates if TOKEN is a quotation mark"""
# Note: this includes checks for MS Word smart quotes because they occur in the training data
# TODO: handle properly with respect to Unicode encoding (e.g., UTF-8)
return token in "\'\"\x91\x92\x93\x94"
def is_punct(token, POS=None):
"""Indicates if TOKEN is a punctuation symbol"""
# EX: is_punct('$', '$') => True
return (re.search("[^A-Za-z0-9]", token[0:1]) or
(POS and re.search("[^A-Za-z]", POS[0:1])))
# TODO: alternative to is_punct
## def is_punctuation(token, POS):
## """Indicates if TOKEN is a punctuation symbol"""
## # EX: is_punctuation('$', '$') => True
## # TODO: find definitive source (or use ispunct-type function)
## punctuation_chars_regex = r"[\`\~\!\@\#\$\%\^\&\*\(\)\_\-\+\=\{\}\[\]\:\;\"\'\<\>\,\.]"
## return (re.search(punctuation_chars_regex, token[0:1]) or
## (POS and re.search("[^A-Za-z]", POS[0:1])))
#------------------------------------------------------------------------
def usage():
"""Displays command-line usage"""
usage = """
Usage: _SCRIPT_ [--help] [--just-tokenize] [--lowercase] file ...
Example:
echo "My dawg has fleas" | _SCRIPT_ -
Notes:
- Intended more as a library module
- In standalone mode it runs the text processing pipeline over the file:
sentence splitting, word tokenization, and part-of-speech tagging
- Set SKIP_NLTK environment variable to 1 to disable NLTK usage.
"""
# TODO: __file__ => sys.argv[1]???
tpo.print_stderr(usage.replace("_SCRIPT_", __file__))
return
def main():
"""
Main routine: parse arguments and perform main processing
TODO: revise comments
Note: Used to avoid conflicts with globals (e.g., if this were done at end of script).
"""
# Initialize
tpo.debug_print("main(): sys.argv=%s" % sys.argv, 4)
# Show usage statement if no arguments or if --help specified
just_tokenize = False
make_lowercase = False
show_usage = ((len(sys.argv) == 1) or (sys.argv[1] == "--help"))
arg_num = 1
while ((not show_usage) and arg_num < len(sys.argv) and (sys.argv[arg_num][0] == "-")):
if (sys.argv[arg_num] == "--just-tokenize"):
just_tokenize = True
elif (sys.argv[arg_num] == "--lowercase"):
make_lowercase = True
elif sys.argv[arg_num] == "-":
break
else:
tpo.print_stderr("Invalid argument: %s" % sys.argv[arg_num])
show_usage = True
arg_num += 1
if (show_usage):
usage()
sys.exit()
# Run the text from each file through the pipeline
for i in range(arg_num, len(sys.argv)):
# Input the entire text from the file (or stdin if - specified)
filename = sys.argv[i]
input_handle = open(filename, "r") if (filename != "-") else sys.stdin
# Analyze the text
while True:
text = input_handle.readline() if LINE_MODE else input_handle.read()
if text == "":
break
text = text.strip()
if just_tokenize:
# Just tokenize sentence and words
# Note: text lowercased at end to allow for case clues (e.g., "Dr. Jones")
for tokenized_line in tokenize_text(text):
tokenized_text = " ".join(tokenized_line)
if make_lowercase:
tokenized_text = tokenized_text.lower()
print(tokenized_text)
else:
# Show complete pipeline step-by-step
taggings = tokenize_and_tag(text)
# Show the results
# TODO: show original tags if class-based; put JUST_TAGS support in tag_part_of_speech
# TODO: flesh out support for verbose mode
if VERBOSE:
print("text: %s" % text)
if JUST_TAGS:
if VERBOSE:
print("tokens: %s" % OUTPUT_DELIM.join([w for (w, _POS) in taggings]))
print("tags: %s" % OUTPUT_DELIM.join([POS for (_w, POS) in taggings]))
else:
print(OUTPUT_DELIM.join([POS for (_w, POS) in taggings]))
else:
gh.assertion(VERBOSE)
print("taggings: %s" % taggings)
if SHOW_MISSPELLINGS:
misspellings = [w for (w, POS) in taggings if has_spelling_mistake(w)]
gh.assertion(VERBOSE)
print("misspellings: %s" % misspellings)
if VERBOSE:
print("")
# Cleanup
tpo.debug_print("stop %s: %s" % (__file__, tpo.debug_timestamp()), 3)
return
#------------------------------------------------------------------------
# Initialization
if __name__ == "__main__":
main()