Scripts/OOVs.py at master · Bilingual-Annotation-Task-Force/Scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""

Eric Nordstrom
Python 3.6.0
4/29/17

Removes out-of-vocabulary (OOV) words, a.k.a. "mixed words", from the provided series of
tokens. Words are deemed OOV when they are not found in either provided language dictionary.
Results are stored in .TXT file(s) specified by the user. PyDictionary option available for
English dictionary (requires PyDictionary module and reliable internet connection).

Example command line input:

    C:\\Users\\Me\\Research\\Files>..\\Scripts\\OOVs.py "Tokenized Corpus.txt" SpnDict . -d1 utf8

    Interpretation:
        ..\\Scripts\\OOVs.py          Call OOVs.py from separate directory
        "Tokenized Corpus.txt"      Corpus tokens data (quotes to avoid parsing argument)
        SpnDict                     Spanish dictionary (".txt" assumed)
        .                           PyDictionary option chosen for English dictionary
        -d1                         Spanish dictionary encoding type argument called
        utf8                        Spanish dictionary encoding type specification

"""


def PyDict():
    '''Create and return a new PyDictionary object (requires the PyDictionary module).'''

    #Imported here rather than at file level so the module is only needed when this function runs.
    from PyDictionary import PyDictionary as dictionary_class

    lookup = dictionary_class()
    return lookup


def OOV_remove( tokens, D1, D2=None ):
    '''Separates out-of-vocabulary (OOV) words from a series of tokens based on two dictionaries.

    A token is an OOV when it is not made up entirely of punctuation characters
    (string.punctuation), is not found in D1, and is not found in D2.

    Arguments:
        tokens  -- iterable of token strings.
        D1      -- container of dictionary 1 words (anything supporting "in").
        D2      -- container of dictionary 2 words (set, frozenset, list, tuple or dict), or
                   an object with a PyDictionary-style "meaning( word )" method that returns
                   None for unknown words. When omitted, PyDict() is called to create one.

    Returns the tuple ( kept, OOVs ): "kept" is a new list of the tokens with the OOVs
    removed; "OOVs" is a dict mapping each removed word's one-based position in "tokens"
    to the word itself.
    '''

    import contextlib
    import os
    import string

    def as_lookup( words ):
        #Lists and tuples are scanned linearly on every "in"; a frozenset makes each test constant time.
        if isinstance( words, ( list, tuple ) ):
            try:
                return frozenset( words )
            except TypeError: #unhashable entries: keep the original container
                return words
        return words

    if D2 is None: #created on demand so PyDictionary is only required when it is actually used
        D2 = PyDict()

    punctuation = set( string.punctuation )
    vocabulary1 = as_lookup( D1 )
    D2_is_container = isinstance( D2, ( set, frozenset, list, tuple, dict ) )

    if D2_is_container:
        vocabulary2 = as_lookup( D2 )

        def known_to_D2( word ):
            return word in vocabulary2

    else: #assume PyDictionary

        def known_to_D2( word ):
            return D2.meaning( word ) is not None #would print to the console on each OOV if stdout were not redirected below

    kept = [] #tokens that survive, in their original order
    OOVs = {} #removed OOVs keyed by their original (one-based) positions in TOKENS

    with contextlib.ExitStack() as cleanup: #restores stdout and closes the null device even if a lookup raises

        if not D2_is_container:
            null_device = cleanup.enter_context( open( os.devnull, 'w' ) )
            cleanup.enter_context( contextlib.redirect_stdout( null_device ) ) #prevents printing to console during PyDictionary usage

        for position, word in enumerate( tokens, start=1 ): #use start=0 if zero-indexing desired

            #Every character is tested so multi-character punctuation such as "..." is not reported as an OOV.
            only_punctuation = all( character in punctuation for character in word )

            #D2 is consulted last so a PyDictionary lookup only happens when the cheaper tests fail.
            if only_punctuation or word in vocabulary1 or known_to_D2( word ):
                kept.append( word )
            else:
                OOVs[position] = word

    return ( kept, OOVs )


def gettxt( file_name, encoding_type=None ):
    '''Reads a .TXT file and returns its whitespace-separated words as a list.

    ".txt" is appended to file_name when it does not already end with it.
    encoding_type is passed to open() as the encoding; None leaves open()'s default in effect.
    '''

    name = file_name if file_name.endswith( ".txt" ) else file_name + ".txt"

    with open( name, encoding=encoding_type ) as source: #the file is closed even if reading or decoding fails
        return source.read().split() #LIST type


def get_answer(prompt, accepted_answers, answer_type = str):
    '''Loops until input is an accepted answer, then returns it.

    Arguments:
        prompt           -- text passed to input() on every attempt.
        accepted_answers -- container of acceptable values.
        answer_type      -- callable applied to the typed text before it is checked.

    String answers are matched case-insensitively against lowercase entries of
    accepted_answers (so "Y" is accepted when 'y' is listed). The answer is returned
    as converted by answer_type, without any case change. Text that answer_type cannot
    convert is reported as not accepted and the prompt is repeated.
    '''

    while True:

        typed = input( prompt )

        try:
            answer = answer_type( typed )
        except ( TypeError, ValueError ): #e.g. int( "abc" ): treat as a rejected answer instead of crashing
            print( '"%s" is not an accepted response.' % typed )
            continue

        #Only strings have a lowercase form; other types are compared as they are.
        folded = answer.lower() if isinstance( answer, str ) else answer

        if answer in accepted_answers or folded in accepted_answers:
            return answer

        print( '"%s" is not an accepted response.' % str( answer ) )


def destwrite( words, help_message ):
    '''User interface for writing to .TXT files. Does not return anything.

    Arguments:
        words        -- iterable of items to write, one per line. When it is a dict (OOVs),
                        each line is written as "key : value".
        help_message -- text printed whenever the user enters "\\H".

    The user is asked for a destination name (".txt" is appended if missing). If a file by
    that name already exists the user chooses to overwrite it, append to it, try the same
    name with " (1)" added, or enter a different name; the question repeats until a usable
    destination is settled on.
    '''

    import os

    hint_pending = True #the '("\H" for help)' hint is shown only until the help message has been printed once

    def ask_name():
        '''Prompts for a destination name, printing the help message each time "\\H" is entered.'''
        nonlocal hint_pending

        while True:
            name = input( "\nInput destination .TXT file name%s: " % ( ' ("\\H" for help)' * hint_pending ) )
            if name.lower() != "\\h":
                return name
            print( help_message )
            hint_pending = False

    destname = ask_name()
    mode = None #becomes 'w' or 'a' once the destination is settled

    while mode is None: #determine how to open file

        if not destname.endswith( ".txt" ):
            destname += ".txt"

        if not os.path.exists( destname ): #Preferred case: the name is not taken yet
            print( '\nCreating and writing to new file "%s".' % destname )
            mode = 'w'
            continue

        print( "\nFile by that name already exists." )
        prompt = 'Options:\n\t"O" - overwrite contents\n\t"A" - append to contents\n\t"C" - create new file with "(1)" appended to name\n\t"N" - enter new name\n\t[ctrl]+[C] - exit\n\nInput: '
        accepted_answers = { 'o', 'a', 'c', 'n', '\\h' }
        option = get_answer( prompt, accepted_answers ).lower()

        if option == 'o':
            print( '\nOverwriting "%s".' % destname )
            mode = 'w'
        elif option == 'a':
            print( '\nAppending to "%s".' % destname )
            mode = 'a'
        elif option == 'c':
            destname = destname[:-4] + " (1)" #".txt" is put back at the top of the loop
        elif option == 'n':
            destname = ask_name()
        else: #help requested: show it, then ask for a name and check that name like any other
            print( help_message )
            hint_pending = False
            destname = ask_name()

    with open( destname, mode ) as dest: #closed even if a write fails

        if mode == 'a':
            dest.write( "\n" * 9 ) #blank lines separate the new entries from the existing contents

        pairs = isinstance( words, dict ) #OOVs

        for item in words:
            line = str( item )
            if pairs:
                line += " : " + str( words[item] )
            dest.write( line + "\n" )

    print( "Writing complete. File saved." )

def main():
    '''Command line interface.

    Reads the tokens file and the two dictionaries named on the command line (D2 may be "."
    to use PyDictionary), removes the OOVs from the tokens, asks where to save the remaining
    tokens, and optionally saves the removed OOVs as well.

    Raises RuntimeError when the PyDictionary option is combined with a D2 encoding.
    '''

    import argparse
    import os

    parser = argparse.ArgumentParser( description = 'Locate, remove, and record out-of-vocabulary (OOV) words, a.k.a. "mixed words"' )

    parser.add_argument( "TOKENS", help="Name of the .TXT file containing corpus tokens." )
    parser.add_argument( "D1", help="Name of the language 1 dictionary .TXT file" )
    parser.add_argument( "D2", help='Name of the language 2 dictionary .TXT file. Enter "." for PyDictionary (requires PyDictionary module and reliable internet connection). NOTE: PyDictionary only for English; English dictionary must be D2 if using PyDictionary.' )
    parser.add_argument( "-t", "--TOKENS_encoding", help="Tokens .TXT file encoding type. Default used if not specified." )
    parser.add_argument( "-d1", "--D1_encoding", help="Language 1 dictionary .TXT file encoding type. Default used if not specified." )
    parser.add_argument( "-d2", "--D2_encoding", help="Language 2 dictionary .TXT file encoding type. Default used if not specified." )
    #The backslash is doubled so the help text shows C:\ (a single \" would be swallowed as an escaped quote).
    parser.add_argument( "-cd", "--change_directory", help='Change the folder in which to locate .TXT files. NOTE: It is also possible to specify individual file locations by including the entire path starting from "C:\\".' )

    args = parser.parse_args()

    if args.change_directory:
        os.chdir( args.change_directory )

    tokens = gettxt( args.TOKENS, args.TOKENS_encoding )

    #Dictionaries are held as sets: every token is looked up in them, and a set lookup does not scan the whole word list.
    D1 = set( gettxt( args.D1, args.D1_encoding ) )

    if args.D2 == ".":

        if args.D2_encoding:
            raise RuntimeError( "Both PyDictionary option and encoding type specified for D2." )

        D2 = PyDict()

    else:
        D2 = set( gettxt( args.D2, args.D2_encoding ) )

    print( "\nRemoving OOVs...\n" )
    ( tokens_without_OOVs, OOVs ) = OOV_remove( tokens, D1, D2 )
    print( "\nOOVs removed.\n" )

    help_message = '\nDestination .TXT file used to store tokens list after removing out-of-vocabulary (OOV) words, a.k.a. "mixed words". If destination file to be outside of current working directory, include file location path in name.'
    destwrite( tokens_without_OOVs, help_message )

    prompt = "\nWrite removed OOVs to .TXT file? (Y/N): "
    accepted_answers = { 'y', 'n' }
    keep_OOVs = get_answer( prompt, accepted_answers )

    if keep_OOVs.lower() == 'y':
        help_message = '\nDestination .TXT file used to store removed out-of-vocabulary (OOV) words, a.k.a. "mixed words", and their corresponding locations in the original tokens list. If destination file to be outside of current working directory, include file location path in name.'
        destwrite( OOVs, help_message )

    print( "\nDone." )

#Run the command line interface only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()