-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtable_lookup.py
689 lines (592 loc) · 28.4 KB
/
table_lookup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
#! /usr/bin/env python
#
# Support for table lookup of named entities and other word lists. This
# supports incremental lookup as with tries or "slot hashes" to avoid having to do repeated
# lookups with similar prefixes.
#
# Notes:
# - Slot hashes are based on the approach used in the GATE NLP toolkit. See
# http://gate.ac.uk/sale/tao/splitch13.html
# - Includes workaround for pickling different classes defined in same file:
# http://stefaanlippens.net/pickleproblem
# - An empty key is added to lookup the label for the data, which defaults
# to 'Value' if not specified (as with boolean indicator tables).
#
# TODO:
# - Test Bloom filters as alternative to slot hashes.
# - Look into hybrid hash/trie approach with latter only used for high-frequency phrases.
# - Define iterator and other common collection methods.
# - Make the interface more consistent (e.g., make all constructors accept filename for loading).
#
"""Generic table lookup using a variety of formats"""
import argparse
import os
import re
import sys
from abc import ABCMeta, abstractmethod
import tpo_common as tpo
import glue_helpers as gh
# TODO: Add descriptions for important options
# Module-level options. Each value comes from a tpo.getenv_* helper called with the
# option name and a fallback value (presumably read from the process environment;
# confirm against tpo_common).
#
# Names of the supported table implementations; TABLE_TYPE must be one of these.
TABLE_TYPES = ["slot-hash", "gate-hash", "char-trie", "pat-trie", "trie", "shelve", "kyoto"]
TABLE_TYPE = tpo.getenv_text("TABLE_TYPE", "slot-hash")
assert(TABLE_TYPE in TABLE_TYPES)
# The individual USE_xyz flags default to whatever TABLE_TYPE implies.
USE_SLOT_HASH = tpo.getenv_boolean("USE_SLOT_HASH", (TABLE_TYPE == "slot-hash"))
# Character-level trie: word lists are joined into a single string key (see TrieLookup)
USE_CHAR_TRIE = tpo.getenv_boolean("USE_CHAR_TRIE", (TABLE_TYPE == "char-trie"))
# Passed as the 'compressed' argument when the trie is constructed (see TrieLookup)
USE_PATRICIA_TRIE = tpo.getenv_boolean("USE_PATRICIA_TRIE", (TABLE_TYPE == "pat-trie"))
# Any table type ending in "trie", or either specific trie flag, selects TrieLookup
USE_TRIE = tpo.getenv_boolean("USE_TRIE", (TABLE_TYPE.endswith("trie")) or USE_CHAR_TRIE or USE_PATRICIA_TRIE)
# Keys with more words than this are rejected by read_lookup_table
MAX_PHRASE_LEN = tpo.getenv_integer("MAX_PHRASE_LEN", 25)
# Phrase slots index each word-sequence prefix; word slots index individual words by position
USE_PHRASE_SLOTS = tpo.getenv_boolean("USE_PHRASE_SLOTS", (TABLE_TYPE == "gate-hash"))
USE_WORD_SLOTS = (not USE_PHRASE_SLOTS)
# Disk-backed tables
USE_SHELVE = tpo.getenv_boolean("USE_SHELVE", (TABLE_TYPE == "shelve"))
USE_KYOTO = tpo.getenv_boolean("USE_KYOTO", (TABLE_TYPE == "kyoto"))
USE_DB_HASH = (USE_SHELVE or USE_KYOTO)
# Brute-force lookup tries the whole remaining input instead of growing a phrase incrementally
# (see verify_table_lookup); it is the default for the disk-backed tables.
BRUTE_FORCE = tpo.getenv_boolean("BRUTE_FORCE", USE_DB_HASH)
# Enables the "Saving table"/"Loading table" messages; main() overrides it from --verbose
verbose = tpo.getenv_boolean("VERBOSE", False)
# Pickling workaround: when FILE_MODULE is non-empty, the lookup classes set __module__ to it.
# SET_FILE_MODULE makes the default the base name of this file (without extension).
SET_FILE_MODULE = tpo.getenv_boolean("SET_FILE_MODULE", False)
DEFAULT_FILE_MODULE = "" if not SET_FILE_MODULE else os.path.splitext(os.path.basename(__file__))[0]
FILE_MODULE = tpo.getenv_text("FILE_MODULE", DEFAULT_FILE_MODULE).strip()
if USE_TRIE:
import trie
if USE_SHELVE:
import shelve
if USE_KYOTO:
import kyotocabinet as kc
#------------------------------------------------------------------------
class TableLookup(object):
    """Abstract class for table lookup with support for incremental search

    Keys are given as lists of words. Incremental search is done through
    has_prefix and starts_with: the implementations in this file return an
    implementation-specific context on a match (to be passed back in on the
    next call) and None otherwise.
    """
    # NOTE(review): __metaclass__ is only honored by Python 2; under Python 3 this
    # attribute is ignored, so the abstract methods below are not enforced there.
    __metaclass__ = ABCMeta
    # note: As workaround for pickling problem, explicitly encode full module path.
    if FILE_MODULE:
        __module__ = FILE_MODULE

    def __init__(self, filename=None, overwrite=True):
        """Class constructor: just traces the FILENAME and OVERWRITE arguments"""
        # Note: overwrite applies to specializations requiring external data file
        # such as shelve or kyoto, in contrast to tries and hashes which need explicit save.
        tpo.debug_format("TableLookup.init([filename={f}, overwrite={ow}]); self={s}", 7,
                         f=filename, ow=overwrite, s=self)
        return

    @abstractmethod
    def dump(self, debug_level=7):
        """Traces out the object to stderr if DEBUG_LEVEL in effect"""
        return

    @abstractmethod
    def count(self):
        """Returns number of items in table (-1 when not available)"""
        return -1

    @abstractmethod
    def insert(self, words, value):
        """Insert key list of WORDS with VALUE into table"""
        return

    @abstractmethod
    def flush(self):
        """Flush data to file"""
        return

    @abstractmethod
    def lookup(self, words, context=None):
        """Return values for key list of WORDS, optionally relative to CONTEXT"""
        return

    @abstractmethod
    def has_prefix(self, words, context=None):
        """Whether table has key prefixed with list of WORDS, optionally relative to CONTEXT"""
        return

    def starts_with(self, word, context=None):
        """Whether table has key with prefix WORD, optionally relative to CONTEXT
        Note: convenience wrapper around has_prefix for a single word"""
        tpo.debug_print("Lookup.starts_with(%s, [%s])" % (word, context), 6)
        return(self.has_prefix([word], context))

    @abstractmethod
    def current_value(self, context=None):
        """Return current value relative to CONTEXT"""
        return

    @abstractmethod
    def keys(self):
        """Returns the list of keys in the table"""
        return []

    ## TODO:
    ## @abstractmethod
    ## def close(self):
    ##     """Close file handle"""
    ##     return
class TrieLookup(TableLookup):
    """Trie-based implementation of table lookup

    The search context used for incremental lookup is a sub-trie of self.trie.
    With USE_CHAR_TRIE, word lists are joined with spaces into one string key.
    """
    # note: As workaround for pickling problem, explicitly encode full module path.
    if FILE_MODULE:
        __module__ = FILE_MODULE

    def __init__(self, filename=None, overwrite=True):
        """Create an empty trie; FILENAME and OVERWRITE are only traced"""
        tpo.debug_format("TrieLookup.__init__([filename={f}, overwrite={ow}])\n", 6,
                         f=filename, ow=overwrite)
        self.trie = trie.Trie(compressed=USE_PATRICIA_TRIE)
        return

    def dump(self, debug_level=7):
        """Traces out the object to stderr if DEBUG_LEVEL in effect"""
        tpo.debug_print(self.trie.format(), debug_level)
        return

    def count(self):
        """Returns number of items in table"""
        return self.trie.size()

    def insert(self, words, value):
        """Insert key list of WORDS with VALUE into table"""
        tpo.debug_print("TrieLookup.insert(%s, %s)" % (words, value), 6)
        if USE_CHAR_TRIE:
            words = " ".join(words)
        self.trie.insert(words, value)
        return

    def flush(self):
        """Flush data to file (no-op: the trie is only held in memory)"""
        return

    def lookup(self, words, context=None):
        """Return value for key list of WORDS, optionally relative to CONTEXT (i.e., starting subtrie)"""
        ## Note: returns first value if more than one stored
        # Note: WORDS is passed along as is, because has_prefix does the joining for
        # character tries; joining here as well would put a space between each character.
        start_trie = self.has_prefix(words, context)
        # NOTE(review): this relies on the trie node itself providing current_value();
        # confirm against the trie module (self.current_value(start_trie) may be what was meant).
        value = start_trie.current_value() if start_trie else None
        tpo.debug_print("TrieLookup.lookup(%s, [%s]) => %s" % (words, context, value), 5)
        return value

    def has_prefix(self, words, context=None):
        """Whether table has key prefixed with list of WORDS, optionally relative to CONTEXT (i.e., starting subtrie)
        Note: returns the matching sub-trie if it has any values, otherwise None"""
        if USE_CHAR_TRIE:
            words = " ".join(words)
        start_trie = context if context else self.trie
        if start_trie != self.trie:
            tpo.debug_print("start_trie: %s" % start_trie.format(), 9)

        # Find subnode corresponding to word sequence and make sure values at that node
        sub_trie = start_trie.find_prefix(words)
        # Note: parentheses are required, as % binds tighter than 'and'; without them
        # format() gets invoked even when no sub-trie was found.
        tpo.debug_print("sub_trie: %s" % (sub_trie and sub_trie.format()), 8)
        if (sub_trie and (len(sub_trie.get_all_values()) > 0)):
            tpo.debug_print("sub-trie values: %s; children-keys: %s" % (list(sub_trie.get_all_values()), sub_trie.children.keys()), 6)
        else:
            tpo.debug_print("TrieLookup.has_prefix: ignoring internal node (%s)" % sub_trie, 6)
            sub_trie = None
        tpo.debug_print("TrieLookup.has_prefix(%s, [%s]) => %s" % (words, context, sub_trie), 5)
        return (sub_trie)

    def current_value(self, context=None):
        """Return current value stored at CONTEXT (i.e., starting subtrie), using first if more than one"""
        start_trie = context if context else self.trie
        current_values = list(start_trie.get_all_values())
        value = current_values[0] if current_values else None
        return (value)

    def keys(self):
        """Returns the list of keys in the trie"""
        return self.trie.get_each_key()
class HashSlotLookup(TableLookup):
    """Hash-slot implementation of table lookup, inspired by GATE
    (see http://gate.ac.uk/sale/tao/splitch13.html).

    Their approach uses separate hashes for each possible prefix of subwords.
    That is supported (USE_PHRASE_SLOTS), as well as an expedient that just uses
    slot hashes to see whether any lookup entry has a word at a given position
    (USE_WORD_SLOTS). Complete phrases are always stored in full_hash, so the
    slot hashes only serve as a heuristic filter during incremental search.
    """
    # note: As workaround for pickling problem, explicitly encode full module path.
    if FILE_MODULE:
        __module__ = FILE_MODULE

    def __init__(self, filename=None, overwrite=True, num_slots=10):
        """Initializes hash-slot lookup table with NUM_SLOTS of word-prefix hashes (0 for non-incremental lookup)"""
        tpo.debug_format("HashSlotLookup.__init__([filename={f}, overwrite={ow}, [num_slots={ns})\n", 6,
                         f=filename, ow=overwrite, ns=num_slots)
        self.num_slots = num_slots
        # Note: one more hash than slots is allocated; insert never fills the last one
        self.subword_hash = [{} for _i in range(self.num_slots + 1)]
        ## TODO: self.remainder_hash = {}
        self.full_hash = {}
        gh.assertion(USE_SLOT_HASH or USE_PHRASE_SLOTS)
        return

    def dump(self, debug_level=7):
        """Traces out the slot hashes and full hash to stderr if DEBUG_LEVEL in effect"""
        tpo.debug_print("HashSlotLookup: {", debug_level)
        tpo.debug_print("    num_slots=%s" % self.num_slots, debug_level)
        for i in range(self.num_slots):
            tpo.debug_print("    subword_hash[%d]=%s" % (i, self.subword_hash[i]), debug_level)
        ## TODO: tpo.debug_print("    remainder_hash=%s" % self.remainder_hash, debug_level)
        tpo.debug_print("    full_hash=%s" % self.full_hash, debug_level)
        tpo.debug_print("    }", debug_level)
        return

    def count(self):
        """Returns number of phrases in table"""
        return len(self.full_hash)

    def insert(self, words, value):
        """Insert key list of WORDS with VALUE into table, flagging each word (or prefix) in its slot"""
        tpo.debug_print("HashSlotLookup.insert(%s, %s)" % (words, value), 6)

        # Insert complete phrase into full hash
        phrase = " ".join(words)
        self.full_hash[phrase] = value

        # Set indicator hashes for each subword
        num_key_slots = min(len(words), self.num_slots)
        for i in range(num_key_slots):
            key = words[i] if USE_WORD_SLOTS else " ".join(words[:(i + 1)])
            self.subword_hash[i][key] = True

        # Add remaining subphrase to catchall hash
        ## TODO: if (num_key_slots < len(words)):
        ## TODO:     self.remainder_hash[" ".join(words[num_key_slots:])] = True
        if (num_key_slots < len(words)):
            tpo.debug_print("Warning: too many words for slot-hash lookup: %s" % words)
        return

    def flush(self):
        """Flush data to file (no-op: the hashes are only held in memory)"""
        return

    def lookup(self, words, context=None):
        """Return value for key list of WORDS (None if missing); CONTEXT must not be given"""
        assert(not context)
        phrase = " ".join(words)
        value = self.full_hash.get(phrase)
        tpo.debug_print("HashSlotLookup.lookup(%s) => %s" % (words, value), 5)
        return (value)

    def has_prefix(self, words, context=None):
        """Whether table has key prefixed with list of WORDS, optionally relative to CONTEXT (e.g., starting slot)
        Note: returns the context for the next call if found, otherwise None"""
        found = True

        # Special case of phrase slots (approach used in GATE)
        # note: context is list of previous words found (not slot number as usual)
        if USE_PHRASE_SLOTS:
            new_context = context + words if context else words
            if (len(words) < self.num_slots):
                key = " ".join(new_context)
                slot = len(new_context) - 1
                # Note: a slot past the allocated hashes counts as a miss (rather than raising IndexError)
                if ((slot >= len(self.subword_hash)) or (key not in self.subword_hash[slot])):
                    tpo.debug_print("key '%s' not in slot %d" % (key, slot), 6)
                    new_context = None
            tpo.debug_print("HashSlotLookup.has_prefix(%s, [%s]) => %s" % (words, context, new_context), 5)
            return (new_context)

        # Check each word for occurrence in slot-specific hash
        start_slot = context if context else 0
        last_slot_plus = min(start_slot + len(words), self.num_slots)
        i = 0
        while ((start_slot + i) < last_slot_plus):
            if (words[i] not in self.subword_hash[start_slot + i]):
                found = False
                # Note: reports the slot actually consulted, not the offset within WORDS
                tpo.debug_print("Word '%s' not found in slot %d hash" % (words[i], start_slot + i), 6)
                break
            i += 1

        # TODO: Check remaining words in catchall hash
        # if (found) and ((start_slot + i) == self.num_slots) and (i < len(words)):
        #     remainder = " ".join(words[i:])
        #     if remainder not in self.remainder_hash:
        #         found = False
        #         tpo.debug_print("Remainder '%s' not found in remainder hash" % remainder, 6)
        # Words beyond the last slot cannot be checked, so they are treated as not found
        if (found) and ((start_slot + i) == self.num_slots) and (i < len(words)):
            found = False

        # Returns index of next slot if found, otherwise null
        new_context = (start_slot + i) if found else None
        tpo.debug_print("HashSlotLookup.has_prefix(%s, [%s]) => %s" % (words, context, new_context), 5)
        return new_context

    def current_value(self, context=None):
        """Current value not applicable for slot-hash lookup"""
        return None

    def keys(self):
        """Returns the keys (i.e., space-joined phrases) in the table"""
        return self.full_hash.keys()
class HashDbLookup(TableLookup):
    """Table lookup via hash-like object cached to disk"""
    # Note: only supports brute-force lookup (i.e., non-incremental)

    def __init__(self, filename="table_lookup.hash-db.data", overwrite=True):
        """Class constructor: records FILENAME (OVERWRITE is only traced here)"""
        tpo.debug_format("HashDbLookup.__init__([filename={f}, overwrite={ow}])\n", 6,
                         ow=overwrite, f=filename)
        self.filename = filename
        return

    def has_prefix(self, words, context=None):
        """Whether table has key prefixed with list of WORDS, optionally relative to CONTEXT
        Note: not supported, so issues message and returns None"""
        tpo.print_stderr("has_prefix not supported for hash-db tables")
        return None

    def starts_with(self, word, context=None):
        """Whether table has key with prefix WORD, optionally relative to CONTEXT
        Note: not supported, so issues message and returns None"""
        tpo.print_stderr("starts_with not supported for hash-db tables")
        return None

    def current_value(self, context=None):
        """Return current value relative to CONTEXT
        Note: not supported, so issues message and returns None"""
        tpo.print_stderr("current_value not supported for hash-db tables")
        return None

    @classmethod
    def from_hash(cls, hash_filename, hash_db_filename):
        """Converts hash-based lookup from HASH_FILENAME into db-hash stored at HASH_DB_FILENAME
        Note: currently just intended for interactive use"""
        # TODO: rework to use serialized hash
        # Note: instantiates the class the method was invoked on, so that subclasses
        # (e.g., ShelveLookup.from_hash) get a table with a working insert.
        hash_db = cls(hash_db_filename)
        # Note: uses create_lookup_table, which is just for hashes
        hash_table = tpo.create_lookup_table(hash_filename)
        # Note: iterates over key/value pairs (enumerate would yield index/key instead).
        # Assumes the keys are phrase strings; they are split since insert expects a
        # word list -- TODO confirm against tpo.create_lookup_table.
        for (k, v) in hash_table.items():
            hash_db.insert(k.split(), v)
        return hash_db
class ShelveLookup(HashDbLookup):
    """Table lookup via python shelve (db-backed hash)"""
    # note: As workaround for pickling problem, explicitly encode full module path.
    if FILE_MODULE:
        __module__ = FILE_MODULE

    def __init__(self, filename="table_lookup.shelve.data", overwrite=True):
        """Open shelve at FILENAME, first deleting any existing file if OVERWRITE"""
        tpo.debug_format("ShelveLookup.__init__([filename={f}, overwrite={ow}])\n", 6,
                         ow=overwrite, f=filename)
        # Note: callers such as create_serialized_lookup_table can pass None when no
        # save or load file was given, so fall back to the default name.
        if not filename:
            filename = "table_lookup.shelve.data"
        if overwrite and os.path.exists(filename):
            gh.delete_file(filename)
        if verbose:
            action = "Loading" if os.path.exists(filename) else "Saving"
            print("{act} table {f}".format(act=action, f=filename))
        self.data_store = shelve.open(filename)
        return

    def dump(self, debug_level=7):
        """Traces out the object to stderr if DEBUG_LEVEL in effect (and __debug__)"""
        tpo.debug_print("ShelveLookup: data_store=%s" % self.data_store, debug_level)
        return

    def count(self):
        """Returns number of items in table"""
        # Note: not supported as this requires bringing all of the stored data into memory
        return -1

    def insert(self, words, value):
        """Insert key list of WORDS with VALUE into table"""
        tpo.debug_print("ShelveLookup.insert(%s, %s)" % (words, value), 6)
        phrase = " ".join(words)
        self.data_store[phrase] = value
        return

    def flush(self):
        """Flush data to file"""
        # Note: sync() asks the shelf to write pending data through to disk
        self.data_store.sync()
        return

    def lookup(self, words, context=None):
        """Return values for key list of WORDS (None if missing); CONTEXT is ignored"""
        phrase = " ".join(words)
        value = self.data_store[phrase] if (phrase in self.data_store) else None
        tpo.debug_print("ShelveLookup.lookup(%s) => %s" % (words, value), 5)
        return (value)

    def keys(self):
        """Returns list of keys in the table"""
        return self.data_store.keys()
class KyotoLookup(HashDbLookup):
    """Table lookup via python kyoto (db-backed hash)"""
    # File extension used for the data files (with and without the leading period)
    KCT = "kct"
    KCT_EXT = "." + KCT
    # note: As workaround for pickling problem, explicitly encode full module path.
    if FILE_MODULE:
        __module__ = FILE_MODULE

    def __init__(self, filename=None, overwrite=True):
        """Class constructor: includes symbolic link hack for embedded period issue in filename
        Note: with OVERWRITE the file is recreated for writing; otherwise it is opened read-only"""
        tpo.debug_format("KyotoLookup.__init__([filename={f}, overwrite={ow}])\n", 6,
                         ow=overwrite, f=filename)
        if not filename:
            filename = "table_lookup_kyoto" + self.KCT_EXT
        self.data_store = kc.DB()

        # Normalize the filename for use with quirky Kyoto
        ## HACK: ensure no embedded periods (and use symbolic link for user name)
        # Note: only the base name is rewritten, so that directory components such as
        # "./" or "../" stay intact, and the extension is handled separately so that
        # a "_kct" inside the name is not turned back into a period.
        directory, base_name = os.path.split(filename)
        if base_name.endswith(self.KCT_EXT):
            base_name = base_name[:-len(self.KCT_EXT)]
        internal_filename = os.path.join(directory, base_name.replace(".", "_") + self.KCT_EXT)
        tpo.debug_format("internal_filename={f}", 5, f=internal_filename)
        gh.assertion(internal_filename.endswith(self.KCT_EXT))

        # Determine the file mode
        file_mode = None
        if overwrite:
            file_mode = (kc.DB.OWRITER | kc.DB.OCREATE)
            if os.path.exists(internal_filename):
                gh.delete_file(internal_filename)
            if verbose:
                print("Saving table {f}".format(f=internal_filename))
        else:
            if verbose:
                print("Loading table {f}".format(f=internal_filename))
            file_mode = kc.DB.OREADER

        # Open the file and create symbolic link to user name
        open_ok = self.open(internal_filename, file_mode)
        gh.assertion(open_ok)
        if overwrite and (internal_filename != filename):
            tpo.debug_format("Warning: Creating symbolic link to internal file for Kyoto", 3,
                             int=internal_filename)
            # NOTE(review): the filenames are interpolated into a shell command unquoted,
            # so names with spaces or shell metacharacters would break this.
            gh.run("ln -fs {int} {f}", f=filename, int=internal_filename)
        return

    def open(self, filename, mode=None):
        """Open FILENAME in MODE (read-only by default), returning the status from the data store"""
        if mode is None:
            mode = kc.DB.OREADER
        ok = self.data_store.open(filename, mode)
        tpo.debug_format("KyotoLookup.open({f}, {m}) => {r}; self={s}", 6,
                         f=filename, m=mode, r=ok, s=self)
        return ok

    def close(self):
        """Close the data store, returning the status from the data store"""
        tpo.debug_format("KyotoLookup.close(); self={s}", 6, s=self)
        return self.data_store.close()

    def dump(self, debug_level=7):
        """Traces out the object to stderr if DEBUG_LEVEL in effect (and __debug__)"""
        tpo.debug_print("KyotoLookup: data_store=%s" % self.data_store, debug_level)
        # TODO: use iterate method
        #    dump= []; db.iterate(lambda k,v: dump.append((k, v)))'
        #    see https://github.com/KosyanMedia/kyotocabinet-python/blob/master/kyotocabinet-doc.py
        return

    def count(self):
        """Returns number of items in table"""
        num = len(self.data_store)
        tpo.debug_format("KyotoLookup.count() => {n}", 6, n=num)
        return num

    def insert(self, words, value):
        """Insert key list of WORDS with VALUE into table"""
        tpo.debug_print("KyotoLookup.insert(%s, %s)" % (words, value), 6)
        phrase = " ".join(words)
        insert_ok = self.data_store.add(phrase, value)
        gh.assertion(insert_ok)
        # TODO: return insert_ok
        return

    def lookup(self, words, context=None):
        """Return values for key list of WORDS; CONTEXT is ignored"""
        phrase = " ".join(words)
        value = self.data_store.get(phrase)
        tpo.debug_print("KyotoLookup.lookup(%s) => %s" % (words, value), 5)
        return (value)

    def flush(self):
        """Flush data to file"""
        tpo.debug_format("KyotoLookup.flush(); self={s}", 6, s=self)
        self.data_store.synchronize()
        return

    def keys(self):
        """Returns the list of keys in the table"""
        key_names = []

        def lookup(key, _value):
            """Helper for kyoto iterate: collects each KEY"""
            key_names.append(key)

        self.data_store.iterate(lookup)
        return key_names
#------------------------------------------------------------------------
def read_lookup_table(table, filename):
    """Populate TABLE with entries from FILENAME.
    Input Format: Key[<tab><Value>]
    Notes:
    - Keys are stripped, lowercased and split on whitespace into word lists.
    - The value defaults to True when omitted (i.e., boolean indicator table).
    - The value on the first line is also stored under the empty key as the label
      for the data ("Value" if omitted).
    - I/O and value errors are just traced, not raised."""
    try:
        with open(filename) as f:
            for line_num, line in enumerate(f):
                ## OLD: line = line.strip().lower()
                tpo.debug_print("table L%d: %s" % (line_num + 1, line), 7)

                # Check for phrase with optional tab-separated value
                # Note: every group is optional, so this matches any line
                match = re.match("^([^\t]*)(\t(.*))?", line)
                if match:
                    ## OLD: key = match.group(1)
                    key = match.group(1).strip().lower()
                    value = match.group(3) if match.group(3) else True
                    if (line_num == 0):
                        # Add special entry to indicate label for value
                        label = value if (value is not True) else "Value"
                        table.insert([""], label)
                    if not key:
                        # Skip blank keys, which otherwise would replace the label
                        # stored under the empty key.
                        continue
                    words = re.split(r"\s+", key)

                    # Add entry unless exceeds length limit
                    if (len(words) <= MAX_PHRASE_LEN):
                        table.insert(words, value)
                    else:
                        # Note: the keyword names must match the placeholders in the message
                        tpo.print_stderr("Max phrase length ({max}) exceeded in key ({len}): {phr}",
                                         max=MAX_PHRASE_LEN, len=len(words), phr=words)
                else:
                    tpo.debug_print("Ignoring line %d of %s: %s" % (line_num + 1, filename, line))
    except (IOError, ValueError):
        tpo.debug_print("Warning: Exception reading lookup table from %s: %s" % (filename, str(sys.exc_info())), 2)
    return
def open_lookup_table(filename):
    """Open FILENAME as the table type specified by environment options like USE_KYOTO
    Note: the options are checked in the order shelve, kyoto and trie, with the
    slot hash as the fallback; the table is created with overwrite disabled."""
    # Each option paired with the class it selects (first enabled one wins)
    candidates = (
        (USE_SHELVE, ShelveLookup),
        (USE_KYOTO, KyotoLookup),
        (USE_TRIE, TrieLookup),
    )
    lookup_class = HashSlotLookup
    for enabled, candidate_class in candidates:
        if enabled:
            lookup_class = candidate_class
            break
    table = lookup_class(overwrite=False, filename=filename)
    tpo.debug_format("open_lookup_table({f}) => {t}", 4, f=filename, t=table)
    return table
def create_serialized_lookup_table(input_filename, save_filename=None, load_filename=None):
    """Create lookup table from term mappings in INPUT_FILENAME (one pair per line), using lowercase keys.
    The data object is written to SAVE_FILENAME (e.g., pickle format). If the input filename is -,
    then the table is loaded from LOAD_FILENAME.
    Note: the db-backed tables (shelve and kyoto) use their own data file, so no separate
    store or load is done for them."""
    # Note: for transparent handling of shelve-based tables, the input filename can be specified as - for n/a
    tpo.debug_print("create_serialized_lookup_table(%s, [save_filename=%s], [load_filename=%s])" % (input_filename, save_filename, load_filename), 4)

    # Instantiate the table type in effect
    # note: the db-backed tables overwrite their data file only when saving
    db_filename = save_filename or load_filename
    if USE_SHELVE:
        table = ShelveLookup(db_filename, overwrite=save_filename)
    elif USE_KYOTO:
        table = KyotoLookup(db_filename, overwrite=save_filename)
        ## TODO: table.flush()
        ## TEST: table.close(); table.open(save_filename or load_filename)
    elif USE_TRIE:
        table = TrieLookup()
    else:
        table = HashSlotLookup()

    # Either load a previously stored in-memory table or read the entries (optionally storing them)
    if (input_filename == "-"):
        if load_filename and not USE_DB_HASH:
            if verbose:
                print("Loading table {f}".format(f=load_filename))
            table = tpo.load_object(load_filename)
    else:
        read_lookup_table(table, input_filename)
        tpo.trace_object(table, 9, "table")
        if save_filename and not USE_DB_HASH:
            if verbose:
                print("Saving table {f}".format(f=save_filename))
            tpo.store_object(save_filename, table)
    table.dump()
    return table
def verify_table_lookup(table):
    """Verifies that each entry from STDIN can be successfully retrieved from TABLE
    Note: each input line is lowercased and scanned left to right for the longest phrase
    found in the table; the matches are printed per line, followed by overall counts."""
    # Search for items from table occurring in input
    tpo.debug_print("Verifing table lookup", tpo.ALWAYS)
    total_matches = 0
    lines_seen = 0
    for raw_line in sys.stdin.readlines():
        lines_seen += 1
        line = raw_line.strip()
        ## TODO: debug_print("L%d: %s" % (fileinput.filelineno(), line), 5)
        tpo.debug_print("L%d: %s" % (lines_seen, line), 5)
        tokens = re.split(r"\s+", line.lower())
        num_tokens = len(tokens)
        matches = []
        pos = 0
        while (pos < num_tokens):
            # Determine length of the candidate phrase starting at the current word.
            # With brute-force lookup, all of the remainder is the candidate.
            context = None
            if BRUTE_FORCE:
                span = num_tokens - pos
            else:
                # Start incremental search if current word starts a phrase, and
                # keep adding words while the next one might continue a known phrase
                context = table.starts_with(tokens[pos], None)
                span = 1 if context else 0
                while (context and ((pos + span) < num_tokens)):
                    context = table.starts_with(tokens[pos + span], context)
                    if context:
                        span += 1

            # Verify maximal phrase as lookahead might be heuristic, backing off a word at a time
            while (span > 0):
                candidate = tokens[pos:(pos + span)]
                value = table.current_value(context) if context else None
                if not value:
                    value = table.lookup(candidate)
                if value:
                    phrase = " ".join(candidate)
                    matches.append(phrase if value is True else (phrase, value))
                    total_matches += 1
                    break
                span -= 1

            # Skip past the phrase found (or just the current word)
            pos += max(1, span)

        # Output matching subphrases
        print("%s => %s" % (line, matches))

    # Display some statistics on the lookup results
    print("%d subphrases from %d input lines found" % (total_matches, lines_seen))
def main():
    """Entry point for script: creates (or loads) the lookup table given on the command line,
    then optionally verifies lookups for the lines from stdin and prints the table"""
    # Note: declared up front, as the --verbose argument overrides the module-level setting
    global verbose

    # Parse arguments: table-data search-data
    env_options = tpo.formatted_environment_option_descriptions()
    notes = """
Notes:
- In most cases, external data tables must explicitly be created via --save.
- The following environment options are available:
{env_opts}
""".format(env_opts=env_options)
    parser = argparse.ArgumentParser(description=__doc__, epilog=notes,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    ## TODO: use lookup-file
    parser.add_argument("lookup_file", help="file with keys and values (e.g., named entities and weights)")
    ## TEST: parser.add_argument("lookup-file", help="file with known entities and weight")
    parser.add_argument("--save-file", help="file for storing lookup-table instance")
    parser.add_argument("--load-file", help="file for loading lookup-table instance")
    parser.add_argument("--verbose", default=False, action='store_true', help="Verbose output mode")
    parser.add_argument("--skip-test", default=False, action='store_true', help="skip the verification search for each item in table")
    parser.add_argument("--print-table", default=False, action='store_true', help="print out the table in plain text format")
    ## OLD: parser.add_argument("search_file", help="file with new cases to lookup")
    # TODO: show environment options
    args = parser.parse_args()
    tpo.debug_print("args: %s" % args, 4)
    verbose = args.verbose

    # Create (or load) slot-hash or trie for incremental lookup (or shelve for exact lookup)
    table = create_serialized_lookup_table(args.lookup_file, args.save_file, args.load_file)
    if (not args.skip_test):
        verify_table_lookup(table)
    if (args.print_table):
        for k in table.keys():
            # Note: lookup expects a list of words, so phrase keys are split back into
            # words; passing the string itself would get a space put between each character.
            words = k.split(" ") if hasattr(k, "split") else k
            print("%s\t%s" % (k, table.lookup(words)))
    return

#------------------------------------------------------------------------

if __name__ == '__main__':
    main()