This repository was archived by the owner on Jul 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
111 lines (90 loc) · 3.8 KB
/
scrape.py
File metadata and controls
111 lines (90 loc) · 3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re
import os
import time
import requests
from prettytable import PrettyTable, ALL
from requests_html import HTMLSession
from datetime import datetime
from bs4 import BeautifulSoup
def scrape(search_url, cond=None):
session = HTMLSession()
if cond == 'New':
url = f'https://web.archive.org/web/20240000000000*/{search_url}'
else:
url = search_url
try:
response = session.get(url)
response.html.render(sleep=6, timeout=50)
session.close()
return response
except Exception as e:
print(f"An error occurred: {e}")
session.close()
return None
def termResults(response):
if response is not None:
table = PrettyTable(["Link", "Desc", "Web Pages", "Images", "Audio", "VID/GIF"])
links = response.html.find('.result-item-heading')
descs = response.html.find('.snippet')
url_types = response.html.find('.urls-types')
types = [element.find('b') for element in url_types]
for link, desc, numbers in zip(links, descs, types):
table.add_row([link.text, desc.text] + [web.text for web in numbers[::4]] + [img.text for img in numbers[1::4]] + [aud.text for aud in numbers[2::4]] + [vid.text for vid in numbers[3::4]])
print(table)
def getInfo(response):
if response is not None:
result_details_elements = response.html.find('.captures-range-info')
for info_element in result_details_elements:
info_text = info_element.text
return info_text
def month_name_to_number(month_name):
try:
date_object = datetime.strptime(month_name, "%B")
except ValueError:
date_object = datetime.strptime(month_name, "%b")
return date_object.month
def is_between_dates(start_date, check_date, end_date):
if start_date <= check_date <= end_date:
return True
else:
return False
def makeUrl(info, url, date, time):
parts = info.split('between')
if len(parts) == 2:
date_range = parts[1].split('and')
start_date = date_range[0].strip()
end_date = date_range[1].strip()
start_parts = start_date.split(' ')
start_year = int(start_parts[-1])
start_month = int(month_name_to_number(start_parts[0][:3].upper()))
start_day = int(start_parts[1].rstrip(','))
start_date_full = int(f"{start_year:04d}{start_month:02d}{start_day:02d}")
end_parts = end_date.split(' ')
end_year = int(end_parts[-1].rstrip('.'))
end_month = int(month_name_to_number(end_parts[0][:3].upper()))
end_day = int(end_parts[1].rstrip(','))
end_date_full = int(f"{end_year:04d}{end_month:02d}{end_day:02d}")
if date:
checkDate = is_between_dates(start_date_full, int(date), end_date_full)
if checkDate is True:
if time:
snapshot_url = f'https://web.archive.org/web/{date}{time}/{url}'
else:
snapshot_url = f'https://web.archive.org/web/{date}000000/{url}'
return snapshot_url
else:
print("Date Invalid.")
def find(terms, url):
table = PrettyTable(["Matches"])
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
pattern = re.compile(r'>\s*(.*?)\s*<')
matches = [match.group(1).strip() for match in pattern.finditer(str(soup))]
result_matches = {term: [match for match in matches if term in match] for term in terms}
for term, term_matches in result_matches.items():
if term_matches:
print(f'\nMatches for "{term}":')
for match in term_matches:
print(f'• {match}')
else:
print(f'\nNo matches found for "{term}".')