dataset_crawler.py
# Make sure to install all the dependencies before running the script. Happy hacking xD
'''
author @DravitLochan
'''
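# The only third-party dependency here is BeautifulSoup; assuming Python 3
# and pip are available, a typical install would be:
#   pip install beautifulsoup4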
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Base gallery URL; the page number is appended while crawling.
url = "http://www.numberplates.com/pr-number-plate-gallery.asp?page="
urlSite = "www.numberplates.com/"

def main():
    # Walk the gallery pages in reverse (510 down to 1) and record every
    # thumbnail image URL in data_set_urls.txt, one per line.
    with open("data_set_urls.txt", "w") as f:
        for page_num in range(510, 0, -1):
            page = urlopen(url + str(page_num))
            soup = BeautifulSoup(page, "html.parser")
            all_anchors = soup.find_all('a')
            for anchor in all_anchors:
                # Named anchors wrap the gallery thumbnails on each page.
                if anchor.has_attr('name'):
                    imgs = anchor.find_all('img')
                    # Guard against anchors with no image or no src attribute.
                    if imgs and imgs[0].has_attr('src'):
                        print(urlSite + str(imgs[0]['src']))
                        f.write(urlSite + str(imgs[0]['src']) + "\n")

if __name__ == "__main__":
    main()
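
# A minimal usage sketch, assuming Python 3 with beautifulsoup4 installed:
#   python dataset_crawler.py
# The collected image URLs land in data_set_urls.txt, one per line.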