-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathquery_grampat.py
187 lines (159 loc) · 9.25 KB
/
query_grampat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import operator
import os
from collections import defaultdict

import dill
def load_files(data_dir=''):
    """
    Load the four `*.grampat.dill` files and return their contents.

    Args:
        data_dir: Directory that contains `bnc.grampat.dill`,
            `clcfce.grampat.dill`, `efcamdat.grampat.dill` and
            `lang8.grampat.dill`. The default (empty string) resolves the
            file names relative to the current working directory.

    Returns:
        A 4-tuple `(bnc, clcfce, efcamdat, lang8)` with the object
        deserialized from each file, in that order.

    Raises:
        OSError: if one of the files cannot be opened.
    """
    corpus_names = ('bnc', 'clcfce', 'efcamdat', 'lang8')
    loaded = []
    for corpus_name in corpus_names:
        # os.path.join('', name) returns `name` unchanged, so the default
        # keeps opening exactly the same relative paths as before.
        path = os.path.join(data_dir, '{}.grampat.dill'.format(corpus_name))
        # SECURITY: dill (like pickle) can execute arbitrary code while
        # deserializing. Only load files that come from a trusted source.
        with open(path, 'rb') as f:
            loaded.append(dill.load(f))
    return tuple(loaded)
def get_head_stpat_dict(count_dict):
    """
    Re-key `count_dict` so that the headword becomes the outermost key.

    Input layout:  count_dict[src_pat][tgt_pat][head] -> count
    Output layout: result[head][src_pat][tgt_pat]     -> count

    Entries with a falsy count (e.g. 0) are skipped. Every kept count is also
    accumulated under the wildcard key '*':
      - result['*'][src_pat][tgt_pat]: sum over all headwords
      - result[head]['*'][tgt_pat]:    sum over all source patterns
      - result[head][src_pat]['*']:    sum over all target patterns

    Returns a three-level nested defaultdict whose innermost default is 0.
    """
    def _counts():
        # Innermost level: tgt_pat -> count
        return defaultdict(int)

    def _pattern_table():
        # Middle level: src_pat -> {tgt_pat -> count}
        return defaultdict(_counts)

    by_head = defaultdict(_pattern_table)
    for source, targets in count_dict.items():
        for target, heads in targets.items():
            for word, n in heads.items():
                if not n:
                    continue
                table = by_head[word]
                # src -> tgt
                table[source][target] = n
                # Wildcard totals
                by_head['*'][source][target] += n
                table['*'][target] += n
                table[source]['*'] += n
    return by_head
def get_inconsistent_dict(head_stpat_dict):
    """
    Find inconsistent grammar patterns for each headword, i.e. pairs of
    patterns for which both `src_pat` -> `tgt_pat` and `tgt_pat` -> `src_pat`
    exist (with a truthy reverse count) for the same headword.

    Only pairs whose whitespace-separated tokens differ, and differ solely in
    preposition tokens, are kept. The wildcard key '*' is ignored at every
    level, and each unordered pair is reported once per headword.

    Args:
        head_stpat_dict: mapping laid out as [head][src_pat][tgt_pat] -> count.

    Returns:
        A `defaultdict(list)` mapping each headword to a list of dicts. Each
        dict holds four counts keyed by the strings
        '<src> -> <tgt>', '<tgt> -> <src>', '<src> -> <src>', '<tgt> -> <tgt>'
        (the two self-mapping counts default to 0 when absent).
    """
    # These preps are copied from Jason's `grampat.py`. They are written as
    # individual literals so the set does not depend on how a continued
    # string line happens to be indented.
    preps = frozenset((
        'in_favor_of', '_', 'about', 'after', 'against', 'among', 'as', 'at',
        'between', 'behind', 'by', 'for', 'from', 'in', 'into', 'of', 'on',
        'upon', 'over', 'through', 'to', 'towards', 'toward', 'under', 'with',
        'out',
    ))

    inconsistent_dict = defaultdict(list)
    for head, src_dict in head_stpat_dict.items():
        if head == '*':
            continue
        # Unordered pattern pairs already reported for this headword.
        seen_pairs = set()
        for src_pat, tgt_dict in src_dict.items():
            if src_pat == '*':
                continue
            src_tokens = set(src_pat.split())
            for tgt_pat, forward_count in tgt_dict.items():
                if tgt_pat == '*' or tgt_pat == src_pat:
                    continue
                pair = frozenset((src_pat, tgt_pat))
                if pair in seen_pairs:
                    continue
                # The reverse direction tgt -> src must exist as well;
                # `.get` avoids creating entries when given a defaultdict.
                reverse_dict = src_dict.get(tgt_pat)
                if not reverse_dict or not reverse_dict.get(src_pat):
                    continue
                # Keep only changes that involve preposition words and
                # nothing else.
                changed_tokens = src_tokens.symmetric_difference(tgt_pat.split())
                if not changed_tokens or changed_tokens - preps:
                    continue
                inconsistent_dict[head].append({
                    f'{src_pat} -> {tgt_pat}': forward_count,
                    f'{tgt_pat} -> {src_pat}': reverse_dict[src_pat],
                    f'{src_pat} -> {src_pat}': tgt_dict.get(src_pat, 0),
                    f'{tgt_pat} -> {tgt_pat}': reverse_dict.get(tgt_pat, 0),
                })
                seen_pairs.add(pair)
    return inconsistent_dict
if __name__ == '__main__':
    # Data structure of each dill object, as indexed by this script:
    #   - `count_dict` (3-nested dict):
    #       - key1: source grammar pattern (str), e.g., "V about n"
    #       - key2: target grammar pattern (str), e.g., "V n"
    #       - key3: headword in uppercase (str), e.g., "DISCUSS"
    #       - value: count
    #   - `ngram_dict` (4-nested dict):
    #       - key1: source grammar pattern (str), e.g., "V about n"
    #       - key2: target grammar pattern (str), e.g., "V n"
    #       - key3: headword in uppercase (str), e.g., "DISCUSS"
    #       - key4: tuple (source ngram, target ngram),
    #               e.g., ("discuss about something", "discuss something")
    #       - value: count
    # Notes:
    #   - Current dictionaries are queried by parallel grammar patterns.
    #     If you want to query by `headword`, use `get_head_stpat_dict()`.
    #   - BNC grampat's source & target grammar pattern will be the same.

    #---------------------------------------------------------------------------
    # 1. Load files
    #---------------------------------------------------------------------------
    bnc, clcfce, efcamdat, lang8 = load_files()

    # Query shared by all the examples below.
    topk = 5
    src_pat = 'V about n'
    tgt_pat = 'V n'
    head = 'DISCUSS'

    #---------------------------------------------------------------------------
    # 2. Show top 5 headwords of 'V about n' -> 'V n'
    #    Here we show headwords in EFCAMDAT, you can query another corpus as well.
    #---------------------------------------------------------------------------
    print('Top {} headwords that exists "{} -> {}" in EFCAMDAT:'.format(topk, src_pat, tgt_pat))
    print(sorted(efcamdat['count_dict'][src_pat][tgt_pat].items(),
                 key=operator.itemgetter(1), reverse=True)[:topk])
    print()

    #---------------------------------------------------------------------------
    # 3. Show top 5 ngram examples of 'DISCUSS about n' -> 'DISCUSS n'
    #    Here we show n-grams in EFCAMDAT, you can query another corpus as well.
    #---------------------------------------------------------------------------
    print('Top {} n-gram examples of "{} -> {}" of the headword "{}" in EFCAMDAT:'.format(topk, src_pat, tgt_pat, head))
    ngram_counts = efcamdat['ngram_dict'][src_pat][tgt_pat][head]
    # Slice with `topk` so the listing always matches the header printed above.
    for (src_ngram, tgt_ngram), count in sorted(ngram_counts.items(),
                                                key=operator.itemgetter(1), reverse=True)[:topk]:
        print('{} -> {}: {}'.format(src_ngram, tgt_ngram, count))
    print()

    #---------------------------------------------------------------------------
    # 4. Show counts of parallel grammar pattern of the specific headword.
    #    Note that BNC is a monolingual corpus, thus `src_pat` == `tgt_pat`.
    #---------------------------------------------------------------------------
    corpora = (
        ('BNC', bnc),
        ('CLC-FCE', clcfce),
        ('EFCAMDAT', efcamdat),
        ('LANG8', lang8),
    )
    for corpus_name, corpus in corpora:
        print('Grammar pattern counts of "{}" in {}:'.format(head, corpus_name))
        # Order: src->src, src->tgt, tgt->src, tgt->tgt
        for from_pat in (src_pat, tgt_pat):
            for to_pat in (src_pat, tgt_pat):
                print('{} -> {}: {}'.format(from_pat, to_pat, corpus['count_dict'][from_pat][to_pat][head]))
        print()

    #---------------------------------------------------------------------------
    # 5. Find inconsistent parallel grammar patterns for every headword
    #    that only involve changes of preposition.
    #    (BNC is not processed here.)
    #---------------------------------------------------------------------------
    for corpus in (efcamdat, clcfce, lang8):
        corpus['head_stpat_dict'] = get_head_stpat_dict(corpus['count_dict'])
        corpus['inconsistent_dict'] = get_inconsistent_dict(corpus['head_stpat_dict'])

    #---------------------------------------------------------------------------
    # 6. Show inconsistent parallel grammar patterns of the headword "DISCUSS"
    #    in EFCAMDAT.
    #---------------------------------------------------------------------------
    print('Inconsistent parallel grammar patterns of the headword "{}" in EFCAMDAT:'.format(head))
    for case_no, inconsistent_case in enumerate(efcamdat['inconsistent_dict'][head], 1):
        print('Case {}:'.format(case_no))
        for parallel_grampat, count in inconsistent_case.items():
            print('{}: {}'.format(parallel_grampat, count))
        print()
    print()