#! /usr/bin/env python
#
# gensim_test.py: Tests features of gensim document processing, such as TF/IDF-based term vector
# representation and document similarity computations. This takes as input a text file where each
# line represents a separate document.
#
# Notes:
# - gensim is short for 'generate similar' (see http://radimrehurek.com/gensim/about.html)
# - Based on http://radimrehurek.com/gensim/tut1.html.
# - This handles the conversion into gensim's vector representation from raw text.
# - See perform_lsa.py for script that performs latent semantic analysis using gensim.
# - The output dictionary is compressed by default (via bzip2), as with the Gensim script
#   for Wikipedia formatting.
# - See google_word2vec.py for script that supports term similarity instead of document similarity.
#
# TODO:
# - Use token quoting consistent with Gensim topic display (i.e., double).
# - Work around quirk requiring the model to be saved prior to similarity calculations.
#
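#
# Example usage (illustrative sketch; 'corpus.txt' is a placeholder input file with one
# document per line, and the options are defined in main() below):
#   gensim_test.py --save --tfidf corpus.txt
#   gensim_test.py --load --tfidf --similar-docs-of 0 corpus
#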
"""Interface into Gensim package for vector-based text analysis (e.g., document similarity)"""
import argparse
import logging
import os
import re
import sys
import tempfile
from gensim import corpora, models, similarities
import tpo_common as tpo
import glue_helpers as gh
MAX_SIMILAR = tpo.getenv_integer("MAX_SIMILAR", 10)
DOCID_FILENAME = tpo.getenv_text("DOCID_FILENAME", None)
# TODO: default to number of CPU's
PARALLEL_SHARDS = tpo.getenv_integer("PARALLEL_SHARDS", 1)
#
# The following are for pruning the dictionary
MIN_NUM_DOCS = tpo.getenv_integer("MIN_NUM_DOCS", None)
MAX_PCT_DOCS = tpo.getenv_number("MAX_PCT_DOCS", None)
MAX_NUM_TOKENS = tpo.getenv_integer("MAX_NUM_TOKENS", None)
#------------------------------------------------------------------------
class CorpusData(object):
"""Class for processing corpora with gensim (based on MyCorpus from gensim samples)"""
# TODO: isolate class into separate module
def __init__(self, text=None):
"""Constructor: initialize dictionary mapping for terms"""
tpo.debug_print("CorpusData.__init__(%s)" % text, 6)
self.text = text # file representing entire corpus: one line per document
self.mm = None # matrix market format (see http://math.nist.gov/MatrixMarket/formats.html)
if (self.text): # mapping from words to token IDs
self.dictionary = create_dictionary(self.text)
return
def load(self, basename):
"""Load corpus model and dictionary from disk using BASENAME"""
tpo.debug_print("CorpusData.load(%s)" % basename, 6)
assert(self.text is None)
self.dictionary = corpora.Dictionary.load_from_text(basename + '.wordids.txt.bz2')
# TODO: make sure the document index gets loaded (for random access)
self.mm = corpora.MmCorpus(basename + '.bow.mm')
tpo.trace_object(self.mm, 7, "mm")
return
def __iter__(self):
"""Returns iterator over vectors in corpus or over lines in input text"""
tpo.debug_print("CorpusData.__iter__()", 6)
if (self.mm):
for vector in self.mm.__iter__():
yield vector
else:
for line in open(self.text):
# note: assumes there's one document per line with tokens separated by whitespace
yield self.dictionary.doc2bow(line.lower().split())
return
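# Note (illustrative): doc2bow produces a sparse bag-of-words vector of (token_id, count)
# pairs, e.g., "the cat sat on the mat" might map to [(0, 2), (1, 1), (2, 1), (3, 1), (4, 1)]
# given a suitable dictionary.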
def __len__(self):
"""Returns number of documents in corpus"""
num_docs = len(self.mm) if self.mm else self.text_length()
tpo.debug_print("CorpusData.__len__() => %d" % num_docs, 7)
return (num_docs)
def text_length(self):
"""Returns number of documents (i.e., lines) in text"""
length = 0
for _line in open(self.text):
length += 1
tpo.debug_print("CorpusData.text_length() => %d" % length, 7)
return (length)
def __getitem__(self, index):
"""Returns corpus item at INDEX (0-based)"""
result = None
try:
result = self.mm[index]
except RuntimeError:
tpo.print_stderr("Warning: falling back to linear document access (i.e., not random access): make sure index file exists")
result = None
for i, value in enumerate(self.mm):
if (i == index):
result = value
break
return (result)
#------------------------------------------------------------------------
class UserIdMapping(object):
"""Class for mapping Gensim document ID's into user ID's"""
def __init__(self, docid_filename):
"""Class constructor"""
self.docid_mapping = tpo.create_lookup_table(docid_filename, use_linenum=True)
# TODO: just use array indexing (e.g., doc_positions = docid_mapping.values())
self.reverse_docid_mapping = dict((docid, key) for (key, docid) in list(self.docid_mapping.items()))
return
def get_user_id(self, docid):
"""Returns user ID for DOCID (same as input if no mapping exists)"""
user_id = self.docid_mapping.get(docid, docid)
gh.assertion(user_id != docid)
tpo.debug_format("get_user_id({docid}) => {user_id}", 6)
return user_id
def get_gensim_id(self, docid):
"""Returns gensim ID for DOCID (same as input if no mapping exists)"""
gensim_id = self.reverse_docid_mapping.get(docid, docid)
gh.assertion(gensim_id != docid)
tpo.debug_format("get_gensim_id({docid}) => {gensim_id}", 6)
return gensim_id
#------------------------------------------------------------------------
class SimilarDocument(object):
"""Base class for finding similar documents via vector-space cosine measure"""
# TODO: rework so that CorpusData class used to encapsulate both corpus and dictionary
def __init__(self, corpus=None, dictionary=None, verbose_output=False, max_similar=MAX_SIMILAR, docid_filename=DOCID_FILENAME):
"""Class constructor"""
tpo.debug_format("SimilarDocument.__init__({corpus}, {dictionary}, {verbose_output}, {max_similar}, {docid_filename})", 6)
# If specified, override the number of CPUs for parallel processing of shards
if (PARALLEL_SHARDS > 1):
similarities.docsim.PARALLEL_SHARDS = PARALLEL_SHARDS
self.corpus = corpus
self.dictionary = dictionary
self.sim_index = None
if self.corpus:
gh.assertion(isinstance(self.corpus, corpora.MmCorpus))
if self.dictionary:
gh.assertion(isinstance(self.dictionary, corpora.Dictionary))
self.verbose_output = verbose_output
self.max_similar = max_similar
self.docid_mapping = UserIdMapping(docid_filename) if docid_filename else None
return
def get_user_id(self, docid):
"""Returns user ID for DOCID (same as input if no mapping exists)"""
return (self.docid_mapping.get_user_id(docid) if self.docid_mapping else docid)
def get_gensim_id(self, docid):
"""Returns gensim ID for DOCID (same as input if no mapping exists)"""
return (self.docid_mapping.get_gensim_id(docid) if self.docid_mapping else docid)
def find(self, _docid):
"""Return documents similar to DOCID; result is a list of tuples: (docid, weight)"""
tpo.debug_format("SimilarDocument.find({d}); self={s}", 6,
d=_docid, s=self)
assert(False)
return []
def find_all_similar(self):
"""Iterator for getting list of similar documents for each document: each result is a tuple (docid, similar-doc-list), with similar-doc-list a list of (other-docid, weight) tuples"""
tpo.debug_print("SimilarDocument.find_all_similar", 5)
for docid in range(len(self.corpus)):
yield(docid, self.find(docid))
return
# def load(self, filename):
# """Loads similarity model from FILENAME"""
# tpo.debug_print("Loading similarity index from FILENAME", 4)
# self.sim_index = similarities.Similarity.load(filename)
# return
def save(self, filename):
"""Saves similarity model to FILENAME"""
tpo.debug_format("Saving similarity index to {filename}", 4)
return (self.sim_index.save(filename))
class SimilarDocumentByCosine(SimilarDocument):
"""Class for finding similar documents via vector-space cosine measure"""
def __init__(self, corpus=None, dictionary=None, index_file=None, verbose_output=False, max_similar=MAX_SIMILAR, docid_filename=DOCID_FILENAME):
"""Class constructor"""
tpo.debug_format("SimilarDocumentByCosine.__init__({corpus}, {dictionary}, {index_file}, {verbose_output}, {max_similar}, {docid_filename})", 6)
# note: index_file serves both as the cache for the similarity object as well as base name for the shards it uses (see gensim's docsim.py)
# TODO: rework so that corpus and dictionary not needed to retrieve pre-computed similarity results
SimilarDocument.__init__(self, corpus, dictionary, verbose_output, max_similar, docid_filename)
## BAD: if (self.corpus and self.dictionary):
if ((self.corpus is not None) and (self.dictionary is not None)):
if (gh.non_empty_file(index_file)):
self.sim_index = similarities.Similarity.load(index_file)
# Make sure shard file prefix matches index file
if self.sim_index.output_prefix != index_file:
tpo.debug_format("Updating shard file prefix: {self.sim_index.output_prefix} => {index_file}", 5)
self.sim_index.output_prefix = index_file
self.sim_index.check_moved()
else:
self.sim_index = similarities.Similarity(index_file, self.corpus, len(self.dictionary), max_similar)
tpo.debug_print("sim_index: type=%s value=%s" % (type(self.sim_index), self.sim_index), 5)
else:
tpo.debug_format("c={self.corpus} d={self.dictionary} sim_index={self.sim_index}", 6)
tpo.trace_object(self, 5, "SimilarDocumentByCosine.self")
return
def normalize_score(self, score):
"""Normalize cosine value in range [-1, 1] to probability-type score in range [0, 1]"""
# pylint: disable=no-self-use
MIN_SCORE = -1.0
MAX_SCORE = 1.0
EPSILON = 0.001
gh.assertion(MIN_SCORE-EPSILON <= score <= MAX_SCORE+EPSILON)
normal_score = (float(score - MIN_SCORE) / (MAX_SCORE - MIN_SCORE))
tpo.debug_format("SimilarDocumentByCosine.normalize_score({score}) => {normal_score}", 6)
return (normal_score)
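# For example, a raw cosine score of 0.5 normalizes to (0.5 - -1.0) / (1.0 - -1.0) = 0.75.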
def find(self, docid):
"""Return documents similar to DOCID; result is a list of tuples: (docid, weight)"""
tpo.debug_format("SimilarDocumentByCosine.find({docid})", 5)
gh.assertion(self.corpus and self.dictionary and self.sim_index)
gensim_docid = self.get_gensim_id(docid)
try:
similar_gensim_docs = self.sim_index[self.corpus[int(gensim_docid)]]
similar_docs = [(self.get_user_id(doc), self.normalize_score(score)) for (doc, score) in similar_gensim_docs]
if self.verbose_output:
similar_docs = [(sim_docid, score, resolve_terms(self.corpus[int(self.get_gensim_id(sim_docid))], self.dictionary)) for (sim_docid, score) in similar_docs]
except Exception:
tpo.debug_raise()
tpo.print_stderr("Exception retrieving similar documents: " + str(sys.exc_info()))
similar_docs = []
result = similar_docs
tpo.debug_format("find({docid}) => {result}", 5)
return result
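# Illustrative result: find(5) might return [(5, 1.0), (12, 0.83), (3, 0.71)], i.e., the
# query document itself plus its nearest neighbors with normalized cosine scores.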
def derive_all_similarities(self):
"""Precompute similarities, using batch method via chunking (see Gensim documentation in docsim.py)."""
tpo.debug_format("SimilarDocumentByCosine.derive_all_similarities()", 5)
_all_sim = list(self.sim_index[self.corpus])
return
#------------------------------------------------------------------------
def create_dictionary(filename):
"""Create dictionary with word mappings and frequencies from FILENAME, where each line represents a separate document"""
# Note: Updates the dictionary for each line (to allow for very large corpora files).
tpo.debug_print("create_dictionary(%s)" % filename, 5)
dictionary = corpora.Dictionary()
for line in open(filename):
## OLD: line_tokenized = [w for w in re.split(r"\W+", line)]
line_tokenized = re.split(r"\W+", line)
dictionary.doc2bow(line_tokenized, allow_update=True)
return (dictionary)
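# Illustrative sketch: for the line "a cat and a dog", re.split(r"\W+", line) yields
# ['a', 'cat', 'and', 'a', 'dog'] (plus a trailing empty string from the newline), and
# doc2bow(..., allow_update=True) both adds any new tokens to the dictionary and updates
# their document frequencies.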
def resolve_terms(vector, dictionary):
"""Return vector with token ID's replaced by the actual tokens"""
tpo.debug_print("resolve_terms(%s, %s)" % (vector, dictionary), 7)
term_vector = [(dictionary[token_id], count) for (token_id, count) in vector]
return sorted(term_vector, reverse=True, key=lambda term_freq: term_freq[1])
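# For example, resolve_terms([(3, 2), (7, 1)], dictionary) might return
# [('data', 2), ('text', 1)] (hypothetical tokens), with entries sorted by descending count.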
def main():
"""Entry point for script"""
tpo.debug_print("main(): sys.argv=%s" % sys.argv, 4)
# Check command-line arguments
# TODO: make sure each argument supported via external API
parser = argparse.ArgumentParser(description="Creates gensim corpus files, optionally with TF/IDF weighting (for use with perform_lsa.py). Note: input should be a text file when creating from scratch or the basename of model files if loading existing model(s).")
#
parser.add_argument("--save", default=False, action='store_true', help="Save model(s) to disk")
parser.add_argument("--load", default=False, action='store_true', help="Load model(s) from disk")
parser.add_argument("--tfidf", default=False, action='store_true', help="Include TF/IDF analysis")
parser.add_argument("--original", default=False, action='store_true', help="Output original document term matrix (i.e., non-tfidf) when --tfidf specified")
parser.add_argument("--similarity", default=False, action='store_true', help="Derive similarity data")
parser.add_argument("--print", default=False, action='store_true', help="Print vectors on standard output")
parser.add_argument("--expand", default=False, action='store_true', help="Expand corpus in memory")
parser.add_argument("--verbose", default=False, action='store_true', help="Verbose output mode (e.g., resolve term ID's)")
parser.add_argument("--normalize", default=True, action='store_true', help="Normalize TF/IDF scores")
parser.add_argument("--skip-normalize", dest='normalize', action='store_false', help="Don't normalize TF/IDF scores")
parser.add_argument("--similar-docs-of", default="", help="Show similar documents for list of document ID's (or * for all); note: currently requires --load")
parser.add_argument("--max-similar", type=int, default=MAX_SIMILAR, help="Maximum number of similar documents to return")
parser.add_argument("--output-basename", default="", help="Basename to use for output (by default input file without .txt extension)")
parser.add_argument("--docid-filename", default=None, help="Filename with document ID's")
parser.add_argument("--prune-dictionary", default=False, action='store_true', help="Prune dictionary of low/high frequency terms")
#
# note: filename is positional argument
parser.add_argument("filename", default=None, help="Input data filename (or basename when loading previously saved model); use - for stdin")
#
args = vars(parser.parse_args())
tpo.debug_print("args = %s" % args, 5)
filename = args['filename']
save = args['save']
load = args['load']
perform_tfidf = args['tfidf']
verbose_output = args['verbose']
normalize = args['normalize']
print_vectors = args['print']
show_original = (args['original'] or (print_vectors and (not perform_tfidf)))
expand_corpus = args['expand']
derive_similarity = args['similarity']
max_similar = args['max_similar']
source_similar_docs = args['similar_docs_of'].replace(",", " ").split()
output_basename = args['output_basename']
docid_filename = args['docid_filename']
prune_dictionary = args['prune_dictionary']
temp_file = None
# Map stdin to temporary file
# TODO: rework in terms of streaming (e.g., for files > 4gb)
if (filename == "-"):
tpo.debug_print("Reading input from stdin", 4)
temp_file = tpo.getenv_text("TEMP_FILE", tempfile.NamedTemporaryFile().name)
filename = temp_file + ".txt"
gh.write_file(filename, gh.read_file(None))
# Derive the basename if not given
# TODO: TFIDF_MODEL_EXT = ".tfidf.mm"
matrix_ext = ".tfidf.mm" if perform_tfidf else ".bow.mm"
input_extension = ".txt" if (not load) else matrix_ext
if not output_basename:
output_basename = gh.remove_extension(filename, input_extension)
tpo.debug_print("output_basename=%s" % output_basename, 5)
# Make sure input file exists
if (not os.path.exists(filename)) and (not load):
if (os.path.exists(filename + input_extension)):
filename += input_extension
# Enable logging if debugging
# TODO: tpo.init_logging()
if (tpo.debugging_level()):
# TODO: use mapping from symbolic LEVEL user option (e.g., via getenv)
level = logging.INFO if (tpo.debug_level < 4) else logging.DEBUG
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=level)
# Read in corpus, optionally in saved vector format
# TODO: put load/save support into CorpusData (and isolate as gensim_corpus.py)
# TODO: bypass loading if TF/IDF version of corpus is being used for document similarity
if load:
corpus_basename = tpo.getenv_text("CORPUS_BASENAME", filename)
corpus_data = CorpusData()
corpus_data.load(corpus_basename)
else:
corpus_data = CorpusData(filename)
tpo.debug_print("corpus_data: type=%s value=%s" % (type(corpus_data), corpus_data), 5)
# Optionally prune low and high frequency terms from dictionary
if (prune_dictionary or MIN_NUM_DOCS or MAX_PCT_DOCS or MAX_NUM_TOKENS):
tpo.debug_print("Pruning dictionary", 4)
option_overrides = {}
if MIN_NUM_DOCS:
option_overrides['no_below'] = MIN_NUM_DOCS
if MAX_PCT_DOCS:
option_overrides['no_above'] = MAX_PCT_DOCS
if MAX_NUM_TOKENS:
option_overrides['keep_n'] = MAX_NUM_TOKENS
corpus_data.dictionary.filter_extremes(**option_overrides)
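# For example, MIN_NUM_DOCS=5 and MAX_PCT_DOCS=0.5 keep only tokens that occur in at least
# 5 documents and in no more than half of all documents (per gensim's filter_extremes).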
# Print the corpus
if (print_vectors and show_original):
print("corpus_data: [")
for docid, vector in enumerate(corpus_data):
print(docid, vector if not verbose_output else resolve_terms(vector, corpus_data.dictionary))
print("]")
# Do optional TF/IDF analysis
# Note: tfidf represents the transformation and tfidf_corpus the transformed corpus
# See http://radimrehurek.com/gensim/tut2.html#transformation-interface.
# TODO: put TFIDF support into CorpusData class
if (perform_tfidf):
tpo.debug_print("TF/IDF", 6)
if load:
tfidf_corpus = corpora.MmCorpus(output_basename + '.tfidf.mm')
else:
mm = list(corpus_data) if expand_corpus else corpus_data
## TODO: mm = corpora.MmCorpus(output_basename + '.bow.mm')
tfidf = models.TfidfModel(mm, id2word=corpus_data.dictionary, normalize=normalize)
tfidf_corpus = tfidf[corpus_data]
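# Illustrative note: tfidf[corpus_data] wraps the corpus in a lazy transformation, so each
# bag-of-words vector is re-weighted on the fly, e.g., [(0, 2), (1, 1)] might become
# [(0, 0.32), (1, 0.95)] depending on document frequencies.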
## TODO: tfidf = models.TfidfModel(corpus_data)
tpo.debug_print("tfidf: type=%s value=%s" % (type(tfidf_corpus), tfidf_corpus), 5)
# Print the TF/IDF version of the corpus
if (print_vectors):
print("tfidf corpus: [")
for docid, vector in enumerate(tfidf_corpus):
print(docid, vector if not verbose_output else resolve_terms(vector, corpus_data.dictionary))
print("]")
# Determine similarity model
if (derive_similarity or source_similar_docs):
assert load, "similarity support requires --load (to use Matrix Market format produced by --save)"
sim_corpus = tfidf_corpus if perform_tfidf else corpus_data.mm
dictionary = corpus_data.dictionary
index_filename = tpo.getenv_text("SIM_INDEX", output_basename + ".sim_index")
sim = SimilarDocumentByCosine(corpus=sim_corpus, dictionary=dictionary, index_file=index_filename, verbose_output=verbose_output, max_similar=max_similar, docid_filename=docid_filename)
# Precompute similarities
if not source_similar_docs:
sim.derive_all_similarities()
# Show similar documents
# TODO: have option to save as data file
if source_similar_docs:
if (source_similar_docs == ['*']):
similar_doc_info = sim.find_all_similar()
else:
similar_doc_info = []
for docid in source_similar_docs:
similar_doc_info.append((docid, sim.find(docid)))
if tpo.verbose_debugging():
similar_doc_info = list(similar_doc_info)
tpo.debug_format("similar_doc_info={similar_doc_info}")
for (docid, similar_docs) in similar_doc_info:
similar_docs = [(d, tpo.round_num(score)) for (d, score) in similar_docs]
print("Documents similar to %s: %s" % (docid, similar_docs))
## TODO: show overlapping terms in verbose mode
# Optionally save main components to disk
if (save):
# TODO: have corpus data save both the dictionary and the matrix
tpo.debug_print("saving corpora files", 6)
if ((not load) or (not gh.non_empty_file(output_basename + '.wordids.txt.bz2'))):
corpus_data.dictionary.save_as_text(output_basename + '.wordids.txt.bz2')
if ((not load) or (not gh.non_empty_file(output_basename + '.bow.mm'))):
mm = list(corpus_data) if expand_corpus else corpus_data
corpora.MmCorpus.serialize(output_basename + '.bow.mm', mm)
if (perform_tfidf):
if ((not load) or (not gh.non_empty_file(output_basename + '.tfidf.mm'))):
corpora.MmCorpus.serialize(output_basename + '.tfidf.mm', tfidf_corpus)
if (derive_similarity):
sim.save(output_basename + '.sim_index')
# Cleanup
if (temp_file and (not tpo.detailed_debugging())):
gh.run("rm -vf {temp_file}")
return
#------------------------------------------------------------------------
if __name__ == '__main__':
main()