-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_wiki_texts.py
59 lines (49 loc) · 1.82 KB
/
extract_wiki_texts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#! /usr/bin/env python
#
# extract_wiki_texts.py: Extracts the tokens from each of the articles in a
# Wikipedia dump
#
#
"""Extraction wikipedia article tokens"""
import re
import sys
from gensim.corpora import WikiCorpus
import debug
import system
# Option flags, obtained via system.getenv_bool with a default of False
# (presumably read from the environment variable of the same name -- confirm
# against the system module).
# INCLUDE_TEMPLATES: when false, main() skips articles whose title starts
# with "template:".
INCLUDE_TEMPLATES = system.getenv_bool("INCLUDE_TEMPLATES", False)
# NORMALIZE_TITLE: when true, main() tidies the article title (spaces become
# underscores) before printing it.
NORMALIZE_TITLE = system.getenv_bool("NORMALIZE_TITLE", False)
def main(args=None):
    """Entry point for script: print the tokens of each article in a wiki dump.

    One line is printed per article, in the tabular form
    <title> TAB <space-separated tokens>.

    args: optional argv-style list (defaults to sys.argv); args[1] must be the
    filename of the Wikipedia dump. If it is missing, a message is written to
    stderr and nothing is processed.

    Returns None. Articles that cannot be output are reported on stderr and
    skipped, so one bad article does not abort the run.
    """
    debug.trace_fmtd(4, "main(): args={a}", a=args)
    if args is None:
        args = sys.argv
    if len(args) <= 1:
        system.print_stderr("{f}:main: need to supply wiki dump filename".
                            format(f=(__file__ or "n/a")))
        return
    filename = args[1]

    # Open Wikipedia corpus but block unnecessary dictionary-building pass
    # (a non-empty placeholder dictionary is supplied instead).
    dummy_dict = {1: 'one'}
    wiki = WikiCorpus(filename, dictionary=dummy_dict)
    # With metadata enabled, get_texts() yields (tokens, (pageid, title)).
    wiki.metadata = True

    # Print title and tokens in tabular format
    for (tokens, (pageid, title)) in wiki.get_texts():
        # Filter templates
        # NOTE(review): the prefix test is case-sensitive against lowercase
        # "template:" -- confirm the titles are lowercased upstream.
        if title.startswith("template:") and not INCLUDE_TEMPLATES:
            debug.trace_fmtd(4, "Ignoring template article {t}", t=title)
            continue

        # Make sure title uses underscores for spaces, omitting extraneous ones
        if NORMALIZE_TITLE:
            # re.sub takes (pattern, replacement, string): strip the leading
            # and trailing spaces from the title, then convert the rest.
            title = re.sub(r"(^ +)|( +$)", "", title)
            title = title.replace(" ", "_")

        # Output the article
        try:
            output = title + "\t" + (" ".join(tokens))
            print(system.to_utf8(output))
        except Exception:
            # Best-effort: report the offending page and carry on with the
            # rest; Ctrl-C and SystemExit are deliberately not intercepted.
            system.print_stderr("Problem with " + str(pageid))
#------------------------------------------------------------------------
# Run the extraction only when invoked as a script (not when imported)
if __name__ == '__main__':
    main()