-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathanaliselexica.py
274 lines (230 loc) · 11.1 KB
/
analiselexica.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import re
import ply.lex as lex
# Extra lexer states. 'ptsearch' is entered (push_state) by the headword rules
# below and left (pop_state) by the t_ptsearch_* rules.
states = (
    ("ptsearch", "exclusive"),
)
# Names of every token the rules in this module can produce.
tokens = (
    "initial_letter",              # single letter alone on its line, e.g. "A"
    "baseword",                    # base word at the start of a line, terminated by ':'
    "baseword_error",              # base word missing its ':', e.g. "worth"
    "prefix_word",                 # '-' (standing for the base word) comes first on the line
    "prefix_word_error",           # e.g. "-, insurance and freight"
    "prefix_word_error_2",         # e.g. "- volume ratio (P/V)" followed by a line break
    "middle1_word",                # one word, then '-', optionally more words
    "middle1_word_error",          # e.g. "sales -" followed by a line break
    "middle1_word_error_2",        # e.g. "down.the -"
    "middle2_word",                # two words, then '-', optionally one more word
    "middle2_word_error",          # e.g. "management information -" followed by a line break
    "middle_word_5",               # e.g. "source and - of funds"
    "suffix_word",                 # three words followed by '-'
    "suffix_error",                # e.g. "automatic data (ADP)-"
    "double_word",                 # two '-' placeholders, e.g. "- base -"
    "prefix_error_word",           # e.g. "-structuring"
    "middle_error_word",           # e.g. "semi-costs"
    "suffix_error_word",           # e.g. "shift-"
    "abbreviation",                # acronym in parentheses, e.g. "(PMTS)"
    "no_hifen",                    # indented entry without '-', e.g. "engineering"
    "no_hifen_paragraph",          # e.g. "return on capital" followed by a line break
    "normalword",                  # English entry that is not derived from a base word
    "a_parenteses",                # parenthesis left open, e.g. "CWM (clerical work"
    "a_parenteses_paragraph",      # e.g. "ROCE (return on capital" followed by a line break
    "f_parenteses",                # closing part of a parenthesis, e.g. "measurement)"
    "portugueseTranslationError",  # e.g. "treinamento (mj dentro da indústria"
    "portugueseTranslation",       # Portuguese text that follows the English entry
    "paragraph",                   # extra token: bare line break
)
# Bare line break: only advances the line counter. Nothing is returned, so no
# token is produced for it.
def t_paragraph(t):
    r'\n'
    t.lexer.lineno = t.lexer.lineno + 1
# Section heading: one character alone on its line, e.g. "A".
def t_initial_letter(t):
    r'\w[ \r\t\f]*\n'
    heading = t.value.strip()
    # The match consumed the line break, so account for it here.
    t.lexer.lineno = t.lexer.lineno + 1
    t.value = heading
    return t
# Stand-alone English entry, e.g. "yearly report", "I.O.U. (I owe you)",
# "buyers's market" or "cost-volume-analysis". It is also remembered as the
# current word used by the '-' expansion rules.
def t_normalword(t):
    r'\w[\w\'\-\.\']*([ \r\t\f]\w[\w\-]*)*([ \r\t\f]\([^\)\n]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    entry = t.value.strip()
    t.value = entry
    t.lexer.word = entry
    # The rest of the line is read in the 'ptsearch' state.
    t.lexer.push_state('ptsearch')
    return t
# Base word terminated by ':', e.g. "automatic data:" or "administration:"
# (a translation may follow on the same line). The cleaned word is stored on
# the lexer so later '-' entries can be expanded with it.
def t_baseword(t):
    r'[ \r\t\f]*\w[ \r\t\f\w\-]*:'
    t.lexer.push_state('ptsearch')
    # The pattern tolerates whitespace before the word and before the ':';
    # drop both so the stored base word is clean when it is spliced into
    # derived entries.
    headword = t.value.strip().rstrip(':').strip()
    t.value = headword
    t.lexer.word = headword
    return t
# Base word whose trailing ':' is missing, e.g. "worth" followed directly by a
# line break.
def t_baseword_error(t):
    r'[ \r\t\f]*\w\w+\n'
    headword = t.value.strip()
    # The match consumed the line break.
    t.lexer.lineno = t.lexer.lineno + 1
    t.lexer.word = headword
    t.value = headword
    return t
# Entry that starts with '-' (standing for the base word), e.g.
# "- of responsibilities (ROF)", "- of responsibilities rof (ROFR)" or
# "- to-rule (to -)". Up to three further words after the first are optional.
def t_prefix_word(t):
    r'[ \r\t\f]*-[ \r\t\f]+\w[\w\-\,]*([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # Callable replacements insert the base word literally; a plain string
    # would be parsed as a template, so a backslash in the word would fail.
    # "- " (hyphen followed by whitespace) -> "<base> "
    expanded = re.sub(r"[ \r\t\f]*-[ \r\t\f]+", lambda _m: base + " ", t.value)
    # " -" not followed by whitespace, as in "(to -)" -> " <base>"
    expanded = re.sub(r"[ \r\t\f]-", lambda _m: " " + base, expanded)
    t.value = expanded.strip()
    return t
# Entry that starts with "-," e.g. "-, insurance and freight".
def t_prefix_word_error(t):
    r'[ \r\t\f]*\-\,[ \r\t\f]\w[\w\-\,]*([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # Only the leading '-' stands for the base word: count=1 keeps hyphens
    # inside later words or parentheses (e.g. "(C-I-F)") intact. The callable
    # replacement inserts the word literally rather than as a template.
    t.value = re.sub(r"[ \r\t\f]*-", lambda _m: base, t.value, count=1).strip()
    return t
# Prefix entry that ends at the line break, e.g. "- volume ratio (P/V)".
def t_prefix_word_error_2(t):
    r'[ \r\t\f]*-[ \r\t\f]\w[\w\,]*([ \r\t\f]\w[\w\-\,]*)*([ \r\t\f]\([^\)]*\))?\n'
    # The parenthesised group may itself contain line breaks, so count every
    # newline in the match instead of assuming exactly one.
    t.lexer.lineno += t.value.count('\n')
    base = t.lexer.word
    # Callable replacement: the base word is inserted literally.
    expanded = re.sub(r"[ \r\t\f]*-[ \r\t\f]+", lambda _m: base + " ", t.value)
    t.value = expanded.strip()
    return t
# One word followed by '-', e.g. "value - tax (VAT)", "value - (VA)",
# "value - Tax Tax (VATT)" or "buyers' -".
def t_middle1_word(t):
    r'[ \r\t\f]*\w[\w\-\,\']*[ \r\t\f]-([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # Callable replacement: the base word is inserted literally, so a
    # backslash in it is not interpreted as a template escape.
    expanded = re.sub(r"[ \r\t\f]*-[ \r\t\f]+", lambda _m: " " + base + " ", t.value)
    t.value = expanded.strip()
    return t
# One word followed by '-' where the entry ends at the line break,
# e.g. "sales -".
def t_middle1_word_error(t):
    r'[ \r\t\f]*\w[\w\-\,]*[ \r\t\f]-([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\([^\)]*\))?\n'
    # The optional parenthesised group may span lines: count every newline.
    t.lexer.lineno += t.value.count('\n')
    base = t.lexer.word
    # Replace only the stand-alone '-' (whitespace before, whitespace or line
    # break after) and keep a separating space, so "sales -" becomes
    # "sales <base>" and hyphens inside compound words are left untouched.
    expanded = re.sub(r"[ \r\t\f]+-(?=\s)", lambda _m: " " + base, t.value, count=1)
    t.value = expanded.strip()
    return t
# FORMAT ERROR: words joined by '.' instead of a space, e.g. "down.the -".
# A warning with the current line number is printed and the entry repaired.
def t_middle1_word_error_2(t):
    r'[ \r\t\f]*\w[\w\-\,\.]*[ \r\t\f]-([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    print("\n!!! Detetado erro de formato na linha " + str(t.lexer.lineno) + " !!!\n")
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # Turn the stray dots into spaces before inserting the base word, so dots
    # that belong to the base word itself are preserved.
    repaired = t.value.replace(".", " ")
    # Replace only the stand-alone '-' and keep a separating space:
    # "down.the -" becomes "down the <base>".
    repaired = re.sub(r"[ \r\t\f]+-(?=\s)", lambda _m: " " + base, repaired, count=1)
    t.value = repaired.strip()
    return t
# Two words followed by '-', e.g. "value tax - (VAT)",
# "value Tax - tax (VTAT)" or "quality (QC) -".
def t_middle2_word(t):
    r'[ \r\t\f]*\w[\w\-\,]*[ \r\t\f][\w\(\)][\w\-\,\)]*[ \r\t\f]-([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # Callable replacement: the base word is inserted literally, so a
    # backslash in it is not interpreted as a template escape.
    expanded = re.sub(r"[ \r\t\f]*-[ \r\t\f]+", lambda _m: " " + base + " ", t.value)
    t.value = expanded.strip()
    return t
# Two words followed by '-' where the entry ends at the line break,
# e.g. "management information -" or "return on -".
def t_middle2_word_error(t):
    r'[ \r\t\f]*\w[\w\-\,]*[ \r\t\f]\w[\w\-\,]*[ \r\t\f]-([ \r\t\f]\w[\w\-\,]*)?([ \r\t\f]\([^\)]*\))?\n'
    # The optional parenthesised group may span lines: count every newline.
    t.lexer.lineno += t.value.count('\n')
    base = t.lexer.word
    # Replace only the stand-alone '-' (whitespace before, whitespace or line
    # break after). Hyphens inside compound words stay as they are, and the
    # whitespace that already follows the '-' is reused, so no double space
    # appears when another word comes next.
    expanded = re.sub(r"[ \r\t\f]+-(?=\s)", lambda _m: " " + base, t.value, count=1)
    t.value = expanded.strip()
    return t
# Two words, '-', then two more words, e.g. "source and - of funds".
def t_middle_word_5(t):
    r'[ \r\t\f]*\w[\w\-\,]*[ \r\t\f]\w[\w\-\,]*[ \r\t\f]-[ \r\t\f]\w[\w\-\,]*[ \r\t\f]\w[\w\-\,]*[ \r\t\f]{3}[ \r\t\f]*'
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # Replace only the stand-alone '-' and reuse the whitespace that follows
    # it: this avoids the double space after the inserted word and leaves
    # hyphens inside compound words alone.
    expanded = re.sub(r"[ \r\t\f]+-(?=\s)", lambda _m: " " + base, t.value, count=1)
    t.value = expanded.strip()
    return t
# Three words followed by '-', e.g. "value tax final - (VTFA)".
def t_suffix_word(t):
    r'[ \r\t\f]*\w[\w\-\,]*[ \r\t\f]\w[\w\-\,]*[ \r\t\f]\w[\w\-\,]*[ \r\t\f]-([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # The placeholder is the first '-' preceded by whitespace; requiring that
    # whitespace (and count=1) keeps hyphens inside compound words such as
    # "value-added" from being replaced as well.
    expanded = re.sub(
        r"[ \r\t\f]+-[ \r\t\f]*", lambda _m: " " + base + " ", t.value, count=1
    )
    t.value = expanded.strip()
    return t
# FORMAT ERROR: '-' glued to the closing parenthesis of the abbreviation,
# e.g. "automatic data (ADP)-". A warning with the current line number is
# printed and the entry rebuilt as "<words> <base> (<abbreviation>)".
def t_suffix_error(t):
    r'[ \r\t\f]*\w[\w\-\,]*[ \r\t\f]\w[\w\-\,]*[ \r\t\f]\(\w[\w\-\,]*\)-([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    print("\n!!! Detetado erro de formato na linha " + str(t.lexer.lineno) + " !!!\n")
    t.lexer.push_state('ptsearch')
    # Split at the first '(' (the abbreviation): 'head' is the word part
    # including its trailing whitespace, 'tail' is everything after the '('.
    head, _, tail = t.value.partition('(')
    # Drop the misplaced '-' that follows the abbreviation's ')'.
    tail = tail.replace(")-", ")", 1)
    # Insert the base word between the words and the abbreviation; the head
    # is used once only, so the words are not repeated in the result.
    t.value = (head + t.lexer.word + " (" + tail).strip()
    return t
# Entry with two '-' placeholders, e.g. "- base -" or
# "price - earnings - (PIE)".
def t_double_word(t):
    r'[ \r\t\f]*(\w[\w\-\,]*[ \r\t\f])?\-[ \r\t\f]\w[\w\-\,]*[ \r\t\f]\-([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # Every '-' followed by whitespace is a placeholder. One pass covers both
    # of them; a second pass could only re-expand hyphens that belong to the
    # inserted base word. The callable inserts the word literally.
    expanded = re.sub(r"-[ \r\t\f]", lambda _m: base + " ", t.value)
    t.value = expanded.strip()
    return t
# FORMAT ERROR: '-' glued to the following word, e.g. "-structuring".
# A warning with the current line number is printed and the entry repaired.
def t_prefix_error_word(t) :
    r'[ \r\t\f]*\-\w[\w\,]*([ \r\t\f]\w[\w\-\,]*)*([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    print("\n!!! Detetado erro de formato na linha " + str(t.lexer.lineno) + " !!!\n")
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # Only the leading '-' stands for the base word; count=1 keeps hyphens
    # inside later compound words intact. The callable inserts it literally.
    expanded = re.sub(r"[ \r\t\f]*-", lambda _m: base + " ", t.value, count=1)
    t.value = expanded.strip()
    return t
# '-' glued between two words, e.g. "semi-costs": the base word is inserted
# right after the hyphen.
def t_middle_error_word(t):
    r'[ \r\t\f]*\w[\w\,]*\-\w[\w\,]*([ \r\t\f]\w[\w\-\,]*)*([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    t.lexer.push_state('ptsearch')
    base = t.lexer.word
    # The placeholder is the first '-' of the match; count=1 leaves hyphens in
    # the following words untouched. The callable inserts the word literally.
    expanded = re.sub(r"-", lambda _m: "-" + base + " ", t.value, count=1)
    t.value = expanded.strip()
    return t
# '-' glued to the end of a word, e.g. "shift-", "resale price-(RPM)" or
# "self-": the base word is appended right after the hyphen.
def t_suffix_error_word(t):
    r'[ \r\t\f]*(\w[\w\,]*[ \r\t\f])?\w[\w\,]*-([ \r\t\f]\w[\w\-,]*)*(\(\w*\))*([ \r\t\f]\([^\)]*\))?[ \r\t\f]{3}[ \r\t\f]*'
    base = t.lexer.word
    # Callable replacements insert the base word literally; a plain string
    # would be parsed as a template, so a backslash in the word would fail.
    # '-' followed by whitespace -> "-<base> "
    expanded = re.sub(r"-\s", lambda _m: "-" + base + " ", t.value)
    # '-' followed directly by '(' -> "-<base> ("
    expanded = re.sub(r"-\(", lambda _m: "-" + base + " (", expanded)
    t.value = expanded.strip()
    t.lexer.push_state('ptsearch')
    return t
# Acronym in parentheses, e.g. "(PMTS)", "(O and M)" or "(PPBS)".
def t_abbreviation(t):
    r'[ \r\t\f]*\(\w[\w, \r\t\f]*\)[ \r\t\f]*'
    acronym = t.value.strip()
    # The translation is read next, in the 'ptsearch' state.
    t.lexer.push_state('ptsearch')
    t.value = acronym
    return t
# Entry whose parenthesis is opened but not closed on this line, e.g.
# "CWM (clerical work", "EEC (European Economic Com-" or "R and D (research and".
def t_a_parenteses(t):
    r'[ \r\t\f]*(\w+[ \r\t\f])+\((\w[\w\,]*([ \r\t\f\-])?)+[ \r\t\f]{3}[ \r\t\f]*'
    opened = t.value.strip()
    # The translation is read next, in the 'ptsearch' state.
    t.lexer.push_state('ptsearch')
    t.value = opened
    return t
# Same as the open-parenthesis entry, but ending at the line break,
# e.g. "ROCE (return on capital".
def t_a_parenteses_paragraph(t):
    r'[ \r\t\f]*(\w+[ \r\t\f])+\((\w[\w\,]*([ \r\t\f\-])?)+\n'
    opened = t.value.strip()
    # The match consumed the line break.
    t.lexer.lineno = t.lexer.lineno + 1
    t.value = opened
    return t
# Closing part of a parenthesis opened on an earlier line, e.g. "measurement)".
def t_f_parenteses(t):
    r'[ \r\t\f]*\w+(([ \r\t\f]\w+)*)?\)[ \r\t\f]*'
    closing = t.value.strip()
    # The translation is read next, in the 'ptsearch' state.
    t.lexer.push_state('ptsearch')
    t.value = closing
    return t
# Entry without any '-' placeholder, e.g. "engineering". Unlike normalword,
# the match begins with whitespace.
def t_no_hifen(t):
    r'[ \r\t\f]+\(?\w[\w\,\-]*([ \r\t\f]\w[\w\-\,]*)*([ \r\t\f]\([^\)]*\))?\)?[ \r\t\f]{3}[ \r\t\f]*'
    entry = t.value.strip()
    # The translation is read next, in the 'ptsearch' state.
    t.lexer.push_state('ptsearch')
    t.value = entry
    return t
# Entry without '-' that ends at the line break, e.g. "return on capital".
def t_no_hifen_paragraph(t):
    r'[ \r\t\f]*\w[\w\,]*([ \r\t\f]\w[\w\-\,]*)*([ \r\t\f]\([^\)]*\))?\n'
    # The optional parenthesised group can itself contain line breaks, so
    # count every newline in the match rather than assuming exactly one.
    t.lexer.lineno += t.value.count('\n')
    t.value = t.value.strip()
    return t
'''
performance - controle (m) orçamentário de rendimen-
to
'''
# FORMAT ERROR: translation containing "(mj", e.g.
# "treinamento (mj dentro da indústria". A warning with the current line
# number is printed and every "(mj" is rewritten as "(m)".
def t_ptsearch_portugueseTranslationError(t):
    r'[^\n]*\(mj[^\n]*\n([ \r\t\f]{12}[^\n]*\n)*'
    # Report before the counter is advanced, so the message shows the line on
    # which the translation starts.
    print("\n!!! Detetado erro de formato na linha " + str(t.lexer.lineno) + " !!!\n")
    t.lexer.pop_state()
    consumed_lines = str(t.value).count('\n')
    t.lexer.lineno = t.lexer.lineno + consumed_lines
    fixed = re.sub(r"\(mj", "(m)", t.value)
    t.value = fixed.strip()
    return t
# Portuguese translation: the rest of the current line plus any continuation
# lines indented by 12 whitespace characters. May contain parentheses,
# accented characters and commas. Whitespace runs are collapsed to single
# spaces and words split by a trailing '-' are joined back together.
def t_ptsearch_portugueseTranslation(t):
    r'[^\n]*\n([ \r\t\f]{12}[^\n]*\n)*'
    t.lexer.pop_state()
    t.lexer.lineno += str(t.value).count('\n')
    pieces = []
    join_next = False  # True when the previous fragment ended with '-'
    for fragment in t.value.split():
        # Separate words with one space, except right after a fragment that
        # was cut by a hyphen: "rendimen-" + "to" -> "rendimento".
        if pieces and not join_next:
            pieces.append(" ")
        join_next = fragment.endswith('-')
        pieces.append(fragment[:-1] if join_next else fragment)
    t.value = "".join(pieces)
    return t
# Empty ignore set for every state: no character is skipped silently, so
# whitespace has to be matched explicitly by the token rules.
t_ANY_ignore = ""
# Error handler for every state: reports the offending character and resumes
# lexing one character further on.
def t_ANY_error(t):
    print(f"Carácter ilegal '{t.value[0]}' na linha {t.lineno}")
    # In a PLY error handler t.value holds the whole remaining input, but only
    # one character is skipped; advance the line counter only when that
    # character is itself a line break, not for every newline still ahead.
    if t.value[0] == '\n':
        t.lexer.lineno += 1
    t.lexer.skip(1)
# Build the lexer from the rules defined in this module.
lexer = lex.lex()
lexer.word = None # for now there is no word available for substitution
# Disabled manual driver, kept as a bare string literal: it would read
# 'dic-finance-en.pt.txt', feed it to the lexer and print every token.
'''
with open('dic-finance-en.pt.txt', 'r') as file:
data = file.read()
lexer.input(data)
while tok := lexer.token():
print(tok)
'''