-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathOOVs.py
More file actions
209 lines (142 loc) · 8.38 KB
/
Copy pathOOVs.py
File metadata and controls
209 lines (142 loc) · 8.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
Eric Nordstrom
Python 3.6.0
4/29/17
Removes out-of-vocabulary (OOV) words, a.k.a. "mixed words", from the provided series of
tokens. Words are deemed OOV when they are not found in either provided language dictionary.
Results are stored in .TXT file(s) specified by the user. PyDictionary option available for
English dictionary (requires PyDictionary module and reliable internet connection).
Example command line input:
C:\Users\Me\Research\Files>..\Scripts\OOVs.py "Tokenized Corpus.txt" SpnDict . -d1 utf8
Interpretation:
..\Scripts\OOVs.py Call OOVs.py from separate directory
"Tokenized Corpus.txt" Corpus tokens data (quotes to avoid parsing argument)
SpnDict Spanish dictionary (".txt" assumed)
. PyDictionary option chosen for English dictionary
-d1 Spanish dictionary encoding type argument called
utf8 Spanish dictionary encoding type specification
"""
def PyDict():  # supplies the PyDictionary-backed Dictionary 2 used by OOV_remove
    """Create and return a new PyDictionary object.

    The third-party PyDictionary package is imported inside the function,
    so it is only required when this function is actually called.
    """
    import PyDictionary as _pydictionary_module

    return _pydictionary_module.PyDictionary()
def _as_lookup(words):
    """Return a hashed copy of a list/tuple for O(1) membership tests; pass anything else through."""
    if isinstance(words, (list, tuple)):
        try:
            return frozenset(words)
        except TypeError:  # unhashable entries: keep the original linear search
            return words
    return words


def OOV_remove(tokens, D1, D2=None):
    """Remove out-of-vocabulary (OOV) words from a series of tokens.

    A token is OOV when it is not punctuation, is not in Dictionary 1 and is
    not known to Dictionary 2.

    Parameters:
        tokens: iterable of token strings.
        D1: container of Dictionary 1 words (anything supporting ``in``).
        D2: Dictionary 2. Either a container of words, or an object with a
            ``meaning(word)`` method that returns None for unknown words
            (e.g. a PyDictionary object). When omitted, ``PyDict()`` is
            called to build one; this happens at call time, so the
            PyDictionary package is only needed when the default is used.

    Returns:
        A tuple ``(kept, OOVs)``: ``kept`` is a list of the tokens with OOVs
        removed, and ``OOVs`` is a dict mapping each removed token's 1-based
        position in ``tokens`` to the token itself.
    """
    import contextlib
    import os
    import string

    if D2 is None:
        D2 = PyDict()

    punctuation = frozenset(string.punctuation)
    d1_lookup = _as_lookup(D1)

    # Anything exposing a callable "meaning" is queried like PyDictionary;
    # everything else is treated as a plain container of words.
    uses_lookup_service = callable(getattr(D2, "meaning", None))
    if uses_lookup_service:
        def missing_from_d2(word):
            return D2.meaning(word) is None
    else:
        d2_lookup = _as_lookup(D2)

        def missing_from_d2(word):
            return word not in d2_lookup

    kept = []
    OOVs = {}
    with contextlib.ExitStack() as stack:
        if uses_lookup_service:
            # The meaning() lookup may print to the console for unknown words.
            # Silence stdout for the duration of the scan; the ExitStack
            # restores stdout and closes the sink even if a lookup raises.
            sink = stack.enter_context(open(os.devnull, "w"))
            stack.enter_context(contextlib.redirect_stdout(sink))

        for position, word in enumerate(tokens, start=1):
            # A token made up only of punctuation characters (",", "...",
            # "--") is never reported as OOV.
            is_punctuation = all(char in punctuation for char in word)
            # Order matters: the (possibly slow) D2 lookup runs last.
            if not is_punctuation and word not in d1_lookup and missing_from_d2(word):
                OOVs[position] = word  # use "position - 1" if zero-indexing is desired
            else:
                kept.append(word)

    return (kept, OOVs)
def gettxt(file_name, encoding_type=None):
    """Read a .TXT file and return its whitespace-separated contents as a list.

    Parameters:
        file_name: path of the file; ".txt" is appended when the name does
            not already end in ".txt" (compared case-insensitively, so
            "WORDS.TXT" is used as given).
        encoding_type: text encoding passed to open(); None selects the
            platform default.

    Returns:
        list of str, produced by str.split() on the whole file contents.
    """
    name = file_name
    if not name.lower().endswith(".txt"):
        name += ".txt"
    # The context manager closes the file as soon as it has been read.
    with open(name, encoding=encoding_type) as source:
        return source.read().split()
def get_answer(prompt, accepted_answers, answer_type=str):
    """Prompt repeatedly until the user enters an accepted answer.

    Parameters:
        prompt: text shown by input() on every attempt.
        accepted_answers: container of allowed answers. String answers are
            matched case-insensitively against it (so "Y" satisfies {'y'}),
            and an exact match is always accepted as well.
        answer_type: callable used to convert the raw input text.

    Returns:
        The converted answer exactly as the user typed it (not lower-cased);
        callers that need a canonical form should normalise it themselves.
    """
    while True:
        raw = input(prompt)
        try:
            answer = answer_type(raw)
        except (TypeError, ValueError):
            # Text that cannot be converted is just another rejected answer.
            print('"%s" is not an accepted response.' % raw)
            continue
        # Only strings have a lower-case form; other types are compared as-is.
        folded = answer.lower() if isinstance(answer, str) else answer
        if answer in accepted_answers or folded in accepted_answers:
            return answer
        print('"%s" is not an accepted response.' % str(answer))
def destwrite(words, help_message):
    """Interactively choose a destination .TXT file and write ``words`` to it.

    The user is asked for a file name (".txt" is appended when missing).
    Entering "\\H" at any file-name prompt prints ``help_message`` and asks
    again. If the file already exists, the user chooses between overwriting,
    appending, creating a "(1)" variant of the name, or entering a new name;
    the name is re-checked after every change.

    Parameters:
        words: a dict (written as "key : value", one pair per line) or any
            other iterable (one item per line).
        help_message: text printed when the user asks for help.

    Returns:
        None. The result is the written file plus console messages.
    """
    import os

    hinted_prompt = '\nInput destination .TXT file name ("\\H" for help): '
    plain_prompt = "\nInput destination .TXT file name: "
    menu = 'Options:\n\t"O" - overwrite contents\n\t"A" - append to contents\n\t"C" - create new file with "(1)" appended to name\n\t"N" - enter new name\n\t[ctrl]+[C] - exit\n\nInput: '
    menu_answers = {'o', 'a', 'c', 'n', '\\h'}
    help_shown = False  # once help has been printed, the hint is dropped from the prompt

    def ask_name():
        """Prompt for a file name, printing the help text whenever "\\H" is entered."""
        nonlocal help_shown
        name = input(plain_prompt if help_shown else hinted_prompt)
        while name.lower() == "\\h":
            print(help_message)
            help_shown = True
            name = input(plain_prompt)
        return name

    destname = ask_name()
    mode = None  # becomes 'w' or 'a' once the destination is settled
    while mode is None:
        if not destname.lower().endswith(".txt"):
            destname += ".txt"

        if not os.path.exists(destname):
            # Preferred case: the name is free, so simply create the file.
            print('\nCreating and writing to new file "%s".' % destname)
            mode = 'w'
            continue

        print("\nFile by that name already exists.")
        option = get_answer(menu, menu_answers).lower()
        if option == 'o':
            print('\nOverwriting "%s".' % destname)
            mode = 'w'
        elif option == 'a':
            print('\nAppending to "%s".' % destname)
            mode = 'a'
        elif option == 'c':
            # Drop the ".txt" suffix; the top of the loop puts it back.
            destname = destname[:-4] + " (1)"
        elif option == 'n':
            destname = ask_name()
        else:  # "\h": show help, then keep looping so the new name is checked too
            print(help_message)
            help_shown = True
            destname = ask_name()

    with open(destname, mode) as dest:
        if mode == 'a':
            dest.write("\n" * 9)  # blank lines separate appended output from existing contents
        if isinstance(words, dict):  # OOVs: position -> word
            for key in words:
                dest.write(str(key) + " : " + str(words[key]) + "\n")
        else:
            for item in words:
                dest.write(str(item) + "\n")
    print("Writing complete. File saved.")
def main():
    """Command-line entry point: remove OOVs from a token file and save the results.

    Parses the command-line arguments, loads the tokens and both
    dictionaries (Dictionary 2 may be PyDictionary, selected with "."),
    removes the OOV words, then interactively writes the cleaned tokens and,
    if the user wants them, the removed OOVs to .TXT files.
    """
    import argparse
    import os

    parser = argparse.ArgumentParser( description = 'Locate, remove, and record out-of-vocabulary (OOV) words, a.k.a. "mixed words"' )
    parser.add_argument( "TOKENS", help="Name of the .TXT file containing corpus tokens." )
    parser.add_argument( "D1", help="Name of the language 1 dictionary .TXT file" )
    parser.add_argument( "D2", help='Name of the language 2 dictionary .TXT file. Enter "." for PyDictionary (requires PyDictionary module and reliable internet connection). NOTE: PyDictionary only for English; English dictionary must be D2 if using PyDictionary.' )
    parser.add_argument( "-t", "--TOKENS_encoding", help="Tokens .TXT file encoding type. Default used if not specified." )
    parser.add_argument( "-d1", "--D1_encoding", help="Language 1 dictionary .TXT file encoding type. Default used if not specified." )
    parser.add_argument( "-d2", "--D2_encoding", help="Language 2 dictionary .TXT file encoding type. Default used if not specified." )
    # The backslash is doubled so the help text really shows "C:\".
    parser.add_argument( "-cd", "--change_directory", help='Change the folder in which to locate .TXT files. NOTE: It is also possible to specify individual file locations by including the entire path starting from "C:\\".' )
    args = parser.parse_args()

    use_pydictionary = args.D2 == "."
    if use_pydictionary and args.D2_encoding:
        # Reject the contradictory options before any file is read;
        # parser.error prints the usage message and exits with status 2.
        parser.error( "Both PyDictionary option and encoding type specified for D2." )

    if args.change_directory:
        os.chdir( args.change_directory )

    tokens = gettxt( args.TOKENS, args.TOKENS_encoding )
    # Dictionaries are only used for membership tests, so hold them as sets.
    D1 = set( gettxt( args.D1, args.D1_encoding ) )
    if use_pydictionary:
        D2 = PyDict()
    else:
        D2 = set( gettxt( args.D2, args.D2_encoding ) )

    print( "\nRemoving OOVs...\n" )
    ( tokens_without_OOVs, OOVs ) = OOV_remove( tokens, D1, D2 )
    print( "\nOOVs removed.\n" )

    help_message = '\nDestination .TXT file used to store tokens list after removing out-of-vocabulary (OOV) words, a.k.a. "mixed words". If destination file to be outside of current working directory, include file location path in name.'
    destwrite( tokens_without_OOVs, help_message )

    prompt = "\nWrite removed OOVs to .TXT file? (Y/N): "
    accepted_answers = { 'y', 'n' }
    keep_OOVs = get_answer( prompt, accepted_answers )
    if keep_OOVs.lower() == 'y':
        help_message = '\nDestination .TXT file used to store removed out-of-vocabulary (OOV) words, a.k.a. "mixed words", and their corresponding locations in the original tokens list. If destination file to be outside of current working directory, include file location path in name.'
        destwrite( OOVs, help_message )

    print( "\nDone." )
# Start the command-line interface only when this file is executed as a
# script; importing it as a module just defines the functions.
if "__main__" == __name__:
    main()