-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcollect_script.py
67 lines (57 loc) · 2.22 KB
/
collect_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""
collect_script.py
Scheduled script to collect data for webapp.
Part of the "Instant Serotonin" project by OperaVaria.
"""
# External imports:
import pickle
from pathlib import Path
from yaml import safe_load
# Import scraping functions:
from scraping.pexels_scrape import pexels_collect
from scraping.pixabay_scrape import pixabay_collect
from scraping.reddit_scrape import reddit_collect
from scraping.unsplash_scrape import unsplash_collect
def scrape(animal_name, query, subreddits, keys):
    """Call collecting functions and assemble a pickle file from returned lists.

    Args:
        animal_name: Identifier for the animal; selects which sites are
            skipped and names the output file (data/<animal_name>_data.p).
        query: Search term passed to the stock-photo site scrapers.
        subreddits: '+'-joined subreddit names passed to the Reddit scraper.
        keys: Parsed credentials mapping (loaded from keys.yaml by the caller).

    Returns:
        None. Side effects: writes the pickle file and prints the item count.
    """
    # Reddit is used for every animal.
    reddit_list = reddit_collect(subreddits, keys)
    # Sand cat and manul results are problematic on some sites, therefore
    # skip those sites entirely (avoids wasted API requests whose results
    # would be discarded anyway).
    if animal_name == "sand_cat":
        post_list = reddit_list
    elif animal_name == "manul":
        post_list = (reddit_list
                     + pixabay_collect(query, keys)
                     + unsplash_collect(query, keys))
    else:
        post_list = (reddit_list
                     + pexels_collect(query, keys)
                     + pixabay_collect(query, keys)
                     + unsplash_collect(query, keys))
    # Pickle the combined data next to this script.
    pfile_path = Path(__file__).resolve().parent / f"data/{animal_name}_data.p"
    with open(pfile_path, "wb") as file:
        pickle.dump(post_list, file)
    # Report the number of items collected.
    print(f"Number of {animal_name} posts: {len(post_list)}")
def main():
    """Image scraping main function: load API keys, then scrape each animal."""
    # Read the credentials file shipped alongside the scraper package.
    keys_path = Path(__file__).parents[0].resolve() / "scraping/auth/keys.yaml"
    with open(keys_path, "r", encoding="utf-8") as keys_file:
        keys = safe_load(keys_file)
    # (animal_name, query, subreddits) for every animal to collect.
    targets = (
        ("capybara", "capybara", "capybara+capybaras+capybarasoncritters"),
        ("hedgehog", "hedgehog", "Hedgehog+Hedgehogs+HedgehogsAreLiquid"),
        ("manul", "manul", "PallasCats+manuls"),
        ("sand_cat", "'sand cat'", "sandcats"),
    )
    for animal, query, subs in targets:
        scrape(animal, query, subs, keys)
    # Print final message.
    return print("Scraping completed.")
# Run the main function only when executed as a script (not on import).
if __name__ == '__main__':
    main()