-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_script_1m_1m.py
More file actions
40 lines (33 loc) · 1.04 KB
/
data_script_1m_1m.py
File metadata and controls
40 lines (33 loc) · 1.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import ast
import numpy as np
import argparse
def split_by_month(lines):
    """Split serialized records into February-2023 and March-2023 text lists.

    Each non-blank line is parsed with ``ast.literal_eval`` and must yield a
    mapping with a ``"text"`` entry and a ``"meta"`` mapping whose ``"yymm"``
    string holds a two-digit year followed by the month number.

    Args:
        lines: Iterable of strings, one serialized record per line (an open
            text file works, so the input can be streamed).

    Returns:
        A ``(members, nonmembers)`` tuple of lists: texts whose ``yymm`` is
        year 23 / month 2, and texts whose ``yymm`` is year 23 / month 3.

    Raises:
        AssertionError: If a record's two-digit year is in 24..50, i.e. it
            passes the skip filter but is not 23.
    """
    members = []
    nonmembers = []
    for line in lines:
        # A blank (e.g. trailing) line is not a record; literal_eval would
        # raise on it, so skip it instead of aborting the whole run.
        if not line.strip():
            continue
        # NOTE(review): literal_eval parses Python literal syntax, so a line
        # containing JSON-only tokens (true/false/null) will fail here —
        # confirm the input never contains them, or switch to json.loads.
        record = ast.literal_eval(line)
        text = record["text"]
        date = record["meta"]["yymm"]
        year = int(date[:2])
        month = int(date[2:])
        # Years <= 22 and years > 50 are silently ignored.
        if year <= 22 or year > 50:
            continue
        # Raised explicitly (rather than via `assert`) so the check still
        # runs under `python -O`; exception type and payload are unchanged.
        if year != 23:
            raise AssertionError((date, year))
        if month == 2:
            members.append(text)
        elif month == 3:
            nonmembers.append(text)
    return members, nonmembers


if __name__ == "__main__":
    import os

    parser = argparse.ArgumentParser(
        description="Extraction script for arxiv 1 month vs 1 month dataset"
    )
    parser.add_argument("--path", help="input jsonl file path", type=str, required=True)
    args = parser.parse_args()
    print(args.path)

    # Stream the file line by line and make sure the handle is closed.
    with open(args.path, "r") as f:
        members, nonmembers = split_by_month(f)

    print(len(members))
    print(len(nonmembers))

    # np.save does not create missing parent directories.
    os.makedirs("data/arxiv1m_1m", exist_ok=True)
    np.save("data/arxiv1m_1m/member.npy", np.asarray(members))
    np.save("data/arxiv1m_1m/nonmember.npy", np.asarray(nonmembers))