-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_yahoo_data.py
91 lines (76 loc) · 3.02 KB
/
read_yahoo_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import argparse
from lxml import etree
import json
import os
# Command-line interface: both paths are required for the script to do anything
# useful, but argparse leaves them as None if omitted (no required=True set).
# Create the parser and add arguments
parser = argparse.ArgumentParser()
# Path to the Yahoo! Answers L6 XML dump (FullOct2007.xml).
parser.add_argument("--Yahoo_data_path", type=str, help="path to load Yahoo data")
# Directory holding train.json / val.json / test.json; outputs are written here too.
parser.add_argument("--CHQ_summ_path", type=str, help="path to load CHQ-Summ dataset")
# Parse and print the results
args = parser.parse_args()
print(args)
def clean(text):
    '''
    Normalize whitespace in a string.

    Replaces newlines/tabs with spaces, collapses every run of
    whitespace to a single space, and strips leading/trailing space.

    :param text: raw string to clean
    :return: cleaned single-spaced string
    '''
    # str.split() with no argument splits on arbitrary whitespace runs
    # (spaces, \n, \t, ...), so joining with ' ' collapses everything in
    # one pass. The previous chained .replace('  ', ' ') calls only
    # halved runs twice, leaving e.g. 5+ consecutive spaces partially
    # uncollapsed.
    return ' '.join(text.split())
def process_chq_summ(yahoo_data_items, chq_summ_data, mode, path_to_save):
    '''
    Join one CHQ-Summ split with the Yahoo question metadata and write
    three parallel line-aligned files: <mode>.id, <mode>.source, <mode>.target.

    :param yahoo_data_items: dict mapping question id -> {'id', 'question', 'content'}
    :param chq_summ_data: path to a CHQ-Summ JSON split (a list of dicts
        carrying at least 'id' and 'human_summary')
    :param mode: split name used as the output filename prefix ('train'/'val'/'test')
    :param path_to_save: directory in which the three output files are written
    :return: None (side effect: writes files, prints the save location)
    '''
    with open(chq_summ_data, 'r') as rfile:
        chq_dataset = json.load(rfile)
    sources = []
    targets = []
    ids = []
    for chq_data_item in chq_dataset:
        # Skip annotations whose question is absent from the Yahoo dump.
        if chq_data_item['id'] not in yahoo_data_items:
            continue
        meta_data = yahoo_data_items[chq_data_item['id']]
        # BUG FIX: `ids` was never populated, so zip(ids, sources, targets)
        # below yielded nothing and all three output files were empty.
        ids.append(chq_data_item['id'])
        # findtext() upstream may return None for a missing element; fall
        # back to '' so the concatenation cannot raise TypeError.
        sources.append((meta_data['question'] or '') + ' ' + (meta_data['content'] or ''))
        targets.append(chq_data_item['human_summary'])
    with open(os.path.join(path_to_save, mode + ".id"), 'w') as f_id, \
            open(os.path.join(path_to_save, mode + ".source"), 'w') as f_src, \
            open(os.path.join(path_to_save, mode + ".target"), 'w') as f_tgt:
        for id, src, tgt in zip(ids, sources, targets):
            src = clean(src)
            tgt = clean(tgt)
            f_id.write(id.strip() + '\n')
            f_src.write(src + '\n')
            f_tgt.write(tgt + '\n')
    print(f"Saved file in : {path_to_save}")
def read_yahoo_data(yahooPath='data/dataset/Yahoo-L6/FullOct2007.xml'):
    '''
    Stream-parse the Yahoo! Answers L6 XML dump and collect per-question
    metadata.

    :param yahooPath: path to the FullOct2007.xml dump
    :return: dict mapping question uri -> {'id', 'question', 'content'}
        (values may be None when the corresponding element is missing)
    '''
    data_items = {}
    ctr = 0
    # iterparse streams the (very large) dump element-by-element instead of
    # loading it whole; recover=True asks lxml to skip malformed markup.
    for event, elem in etree.iterparse(yahooPath, tag="vespaadd", encoding='utf-8', recover=True):
        doc = elem.find('document')
        try:
            q_id = doc.findtext('uri')
            data_items[q_id] = {
                'id': q_id,
                'question': doc.findtext('subject'),
                'content': doc.findtext('content'),
            }
            ctr += 1
            if ctr % 1000 == 0:
                print("Read Questions: ", ctr)
        except Exception as e:
            # Typically AttributeError: doc is None when a <vespaadd> has no
            # <document> child. Narrowed from a bare `except:` which also
            # swallowed KeyboardInterrupt/SystemExit and hid the cause.
            print('ERROR', e)
        finally:
            # Release the parsed subtree so memory stays bounded while
            # streaming the multi-GB dump.
            elem.clear()
    return data_items
def main(args):
    '''
    Load the Yahoo dump once, then materialize source/target files for
    each CHQ-Summ split into the CHQ-Summ directory.

    :param args: parsed argparse namespace with Yahoo_data_path and
        CHQ_summ_path attributes
    '''
    yahoo_data_items = read_yahoo_data(args.Yahoo_data_path)
    # The three splits share the same layout, so process them uniformly.
    for split in ('train', 'val', 'test'):
        process_chq_summ(
            yahoo_data_items,
            os.path.join(args.CHQ_summ_path, split + '.json'),
            mode=split,
            path_to_save=args.CHQ_summ_path,
        )
# Guard the entry point so importing this module does not kick off the
# (long-running) dataset conversion.
if __name__ == "__main__":
    main(args)