Main.py
import os
import json
import sys

import AnimePageFetcher
import SeasonLinkScraper

data_directory = "data/"
fail_filename_format = "failures_%s.txt"
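
# On-disk layout: scraped pages go to data/<year>/<anime id>.json; URLs that
# fail to fetch are appended to data/<year>/failures_<year>.txt for later retry.

# Scrape every anime page for the given years (and, optionally, a subset of
# seasons), e.g. grab_data([2014, 2015]) or grab_data([2016], seasons=["winter"]).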
def grab_data(years, seasons=["winter", "fall", "summer", "spring"]):
    print "Starting MAL scrape session."
    for year in years:
        print "Starting on Year", year
        group = str(year)
        directory = os.path.join(data_directory, group)
        if not os.path.exists(directory):
            os.makedirs(directory)
        error_file_name = fail_filename_format % group
        error_path = os.path.join(directory, error_file_name)
        fail_file = open(error_path, 'a')  # 'a' for append. Capture all failures until fixed.
        for season in seasons:
            print "Starting", str(group) + "_" + season
            animeURLS = SeasonLinkScraper.get_season_anime(year, season)
            AnimePageFetcher.cooldown()
            print "Got all urls."
            for url in animeURLS:
                success, dataset = AnimePageFetcher.getAllDataFromUrl(url)
                if not success:
                    safe_url = url
                    if dataset is not None:
                        safe_url = dataset.get("url", url)
                    fail_file.write(safe_url + "\n")
                    continue
                file_name = str(dataset["id"]) + ".json"
                path_name = os.path.join(directory, file_name)
                with open(path_name, 'w') as f:
                    json.dump(dataset, f)
                AnimePageFetcher.cooldown()
            fail_file.flush()
        fail_file.close()
        print "Done year", year
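
# Re-fetch every URL recorded in a group's failure file, then rewrite the
# failure file with only the URLs that still fail, e.g. fix_fails(2015).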
def fix_fails(group):
    group = str(group)
    print "Starting to fix fails for group", group
    directory = os.path.join(data_directory, group)
    error_file_name = fail_filename_format % group
    error_path = os.path.join(directory, error_file_name)
    if not os.path.isfile(error_path):
        print "Expected failure file", error_path, "does not exist."
        return
    with open(error_path, 'r') as f:
        urls = [l.strip() for l in f.readlines()]
    fails = []
    for url in urls:
        success, dataset = AnimePageFetcher.getAllDataFromUrl(url)
        if not success:
            safe_url = url
            if dataset is not None:
                safe_url = dataset.get("url", url)
            fails.append(safe_url)
            continue
        data_file_name = str(dataset["id"]) + ".json"
        data_path_name = os.path.join(directory, data_file_name)
        with open(data_path_name, 'w') as f:
            json.dump(dataset, f)
        AnimePageFetcher.cooldown()
    os.remove(error_path)
    if len(fails) > 0:
        with open(error_path, 'w') as f:  # Mode 'w' for overwrite, since we're trying to fix things.
            f.write("\n".join(fails))
    print "Done fixing fails for group", group

# Iterates through a folder and checks all json objects have the correct keys
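# Invalid entries get their URL appended to the group's failure file,
# e.g. validate_existing(2015).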
def validate_existing(group):
    group = str(group)
    directory = os.path.join(data_directory, group)
    if not os.path.exists(directory):
        print "Directory for group %s does not exist" % group
        return
    error_file_name = fail_filename_format % group
    error_path = os.path.join(directory, error_file_name)
    error_file = open(error_path, 'a')
    print "Starting validation"
    dir_list = os.listdir(directory)
    for filename in dir_list:
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(directory, filename)) as f:
            data = json.load(f)
            if not AnimePageFetcher.validate(data):
                print data["url"], "was not valid."
                error_file.write(data["url"] + "\n")
    error_file.close()
    print "Done validation"
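
# Cross-check the season listings against the files already on disk and record
# any anime page that has not been downloaded yet, e.g. check_missing(2015).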
def check_missing(year):
    group = str(year)
    directory = os.path.join(data_directory, group)
    if not os.path.exists(directory):
        print "Directory for group %s does not exist" % group
        return
    dir_list = os.listdir(directory)
    error_file_name = fail_filename_format % group
    error_path = os.path.join(directory, error_file_name)
    fail_file = open(error_path, 'a')  # 'a' for append. Capture all failures until fixed.
    print "Checking missing"
    for season in ["winter", "spring", "summer", "fall"]:
        print "Grabbing links for", str(group) + "_" + season
        animeURLS = SeasonLinkScraper.get_season_anime(year, season)
        for url in animeURLS:
            url = AnimePageFetcher.get_safe_url(url)
            (cat, page_id) = AnimePageFetcher.getCategoryAndIDFromUrl(url)
            if str(page_id) + ".json" not in dir_list:
                print "Missing", url
                fail_file.write(url + "\n")
        print "Done", str(group) + "_" + season
    fail_file.close()
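
# Delete a group's failure file if it turned out to be empty,
# e.g. clean_fail_files(2015).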
def clean_fail_files(group):
    group = str(group)
    directory = os.path.join(data_directory, group)
    if not os.path.exists(directory):
        print "Directory for group %s does not exist" % group
        return
    error_file_name = fail_filename_format % group
    error_path = os.path.join(directory, error_file_name)
    if not os.path.isfile(error_path):
        return
    with open(error_path, 'r') as fail_file:
        contents = fail_file.read()
    if len(contents) == 0:
        sys.stdout.write("%s is empty. Deleting... " % error_path)
        os.remove(error_path)
        sys.stdout.write("Done.\n")
        sys.stdout.flush()
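
# Command-line entry point: "python Main.py <year1> <year2> ..." scrapes the
# given years; with no arguments it prints usage and sweeps empty failure
# files for 1998-2015.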
if __name__ == "__main__":
    if len(sys.argv) > 1:
        grab_data([int(i) for i in sys.argv[1:]])
    else:
        print "Use: python Main.py <year1> <year2>"
        for y in xrange(1998, 2016):
            clean_fail_files(y)