-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcache.py
107 lines (82 loc) · 3.83 KB
/
cache.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import subprocess, requests, shutil, sys, bs4, re, os # Default Python libraries
from client.client import * # Importing The Client/Task Manager
from client.config import * # Importing The Config File
from typing import List # Type Hinting
# Global Variables Not In Config (Specific For Caching Only)
SITE: str = "https://cornellcuprobotics.com/"
OUTPUT_PATH: str = "resources/cache/"
DOWNLOAD_PATH: str = "resources/download/"
SIZE: str = "900x600!"
EXT: str = ".JPG"
def convert(urlname: str) -> str:
"""
Converts a given HTTP URL name to an acceptable file name, by stripping,
removing certain words, and adding spaces.
PARAMETERS
----------
urlname - The URL name to convert.
RETURNS
-------
str - The converted URL name.
"""
rmwords: List[str] = ["C1C0CS", "BIZCOMM", "MINIBOTCS", "ECE", "_", "-"]
for rmword in rmwords: urlname = urlname.replace(rmword, "")
for i in range(1, len(urlname)):
if urlname[i].isupper():
urlname = urlname[:i] + " " + urlname[i:]
break
return urlname.strip()
def cache_website() -> None:
"""
Pulls all files from the members page of the Cornell Cup Robotics website, filters
for specific image types, downloads them to a temporary directory, converts them
to a specific size using imagemagick, and then loads them onto the cache through
the client. Removes the temporary directories after completion.
"""
os.makedirs(DOWNLOAD_PATH, exist_ok=True)
os.makedirs(OUTPUT_PATH, exist_ok=True)
response: requests.Response = requests.get(SITE + "members.html")
soup: bs4.BeautifulSoup = bs4.BeautifulSoup(response.text, 'html.parser')
tags: List[bs4.element.Tag] = soup.find_all('img')
urls: List[str] = [img['src'] for img in tags]
urls: List[str] = [img for img in urls if EXT in img]
for i, url in enumerate(urls):
urlname: str = re.search(r'/([\w_-]+[.]JPG)$', url).group(1)
downloadname: str = DOWNLOAD_PATH + urlname[:-4] + ".jpg"
filename: str = OUTPUT_PATH + convert(urlname[:-4]) + ".jpg"
with open(downloadname, 'wb') as f:
if 'http' not in url: url = '{}{}'.format(SITE, url)
response = requests.get(url)
f.write(response.content)
subprocess.call(["magick", downloadname, "-resize", SIZE, filename])
print(f"Finished downloading image {i:02}: {url}")
print("Started loading all images onto cache.")
_ = Client(path=OUTPUT_PATH, open=False)
print("Finished loading all images onto cache.")
shutil.rmtree(DOWNLOAD_PATH)
shutil.rmtree(OUTPUT_PATH)
def cache_images(filenames: str) -> None:
"""
Extracts the file paths and makes sure they exists/are valid, then converts
them to a specific size using imagemagick, and then loads them onto the cache.
Removes the temporary directories after completion.
"""
os.makedirs(OUTPUT_PATH, exist_ok=True)
for filename in filenames:
if (not os.path.exists(filename)):
print(f"File {filename} does not exist, exiting.")
return
a_index: int = filename.rfind("/")+1 if "/" in filename else 0
b_index: int = filename.rfind(".") if "." in filename else len(filename)
diffname: str = OUTPUT_PATH + convert(filename[a_index:b_index]) + ".jpg"
subprocess.call(["convert", filename, "-resize", SIZE, diffname])
print(f"Converted {filename} to {diffname}")
print("Started loading all images onto cache.")
_ = Client(path=OUTPUT_PATH, open=False)
print("Finished loading all images onto cache.")
shutil.rmtree(OUTPUT_PATH)
if __name__ == '__main__':
# If No Arguments, Cache The Cornell Cup Robotics Website
if len(sys.argv) == 1: cache_website()
# Else, Cache The Given Files From Arguments
else: cache_images(sys.argv[1:])