-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcornExchange.py
166 lines (145 loc) · 7.22 KB
/
cornExchange.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
from time import sleep
import re
import json
from datetime import datetime, timedelta, time
import uuid
import traceback
import os
import sys
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from icalendar import Calendar, Event, vBinary
# Check existence of an executable ChromeDriver before doing anything else.
# Each failed check reports the offending path and exits with status 1.
DRIVER = '.\\chromedriver.exe'
if not os.path.exists(DRIVER):
    print(f'ERROR: ChromeDriver [{DRIVER}] not found')
    sys.exit(1)
if not os.path.isfile(DRIVER):
    # The f-prefix is required here, otherwise the literal text "{DRIVER}" is printed.
    print(f'ERROR: ChromeDriver [{DRIVER}] is not a file')
    sys.exit(1)
if not os.access(DRIVER, os.X_OK):
    print(f'ERROR: ChromeDriver [{DRIVER}] is not executable')
    sys.exit(1)
# Start a local ChromeDriver process and attach a WebDriver session to it.
service = Service(DRIVER)
service.start()
# Pass an explicit options object: recent Selenium 4 releases build the
# session capabilities from it and fail when it is omitted
# (NOTE(review): confirm against the installed Selenium version).
driver = webdriver.Remote(service.service_url, options=webdriver.ChromeOptions())
# Load the first page of the event listings.
driver.get('https://cornexchangenew.com/all-events/')
# Build the iCalendar (.ics) container and set its calendar-level properties.
cal = Calendar()
for _prop_name, _prop_value in (
    ('VERSION', '2.0'),
    ('PRODID', '-//Newbury Corn Exchange//'),
    ('X-WR-CALNAME', 'Newbury Corn Exchange'),
    ('X-WR-TIMEZONE', 'Europe/London'),
    ('X-WR-CALDESC', 'Events at The Newbury Corn Exchange'),
    ('X-WR-RELCALTYPE', 'PUBLIC'),
    ('X-WR-RELCALNAME', 'Newbury Corn Exchange'),
    ('X-WR-RELCALURL', 'https://cornexchangenew.com/all-events/'),
    ('X-WR-RELCALPRIV', 'PUBLIC'),
):
    # Properties are added in the same order as listed above.
    cal.add(_prop_name, _prop_value)
# Matches the ordinal suffix directly following a day number ("24th", "1st").
# Anchoring on the digit keeps month names such as "August" intact.
_ORDINAL_SUFFIX = re.compile(r'(?<=\d)(?:st|nd|rd|th)\b')
# Matches a clock time such as "7:30pm", "7:30 PM" or "19:30".
_CLOCK_TIME = re.compile(r'(\d{1,2}):(\d\d)\s*([ap]m)?', re.IGNORECASE)
# Format of a listing date once ordinals are removed: "Friday 24 November 2023".
_LISTING_DATE_FORMAT = '%A %d %B %Y'


def _parse_listing_date(date_text):
    """Convert a cleaned listing date ("Friday 24 November 2023") to a date.

    Raises ValueError when the text does not match the expected format.
    """
    return datetime.strptime(date_text.strip(), _LISTING_DATE_FORMAT).date()


def _parse_start_time(time_text):
    """Return the first clock time found in *time_text*, or None.

    An am/pm marker immediately after the time is honoured: 12:xxam maps to
    hour 0 and 12:xxpm stays at hour 12. Times without a marker are taken as
    written. Out-of-range values yield None rather than raising.
    """
    match = _CLOCK_TIME.search(time_text)
    if match is None:
        return None
    hours = int(match.group(1))
    minutes = int(match.group(2))
    meridiem = (match.group(3) or '').lower()
    if meridiem == 'pm' and hours < 12:
        hours += 12
    elif meridiem == 'am' and hours == 12:
        hours = 0
    try:
        return time(hour=hours, minute=minutes)
    except ValueError:
        return None


def _format_prices(price_text):
    """Turn "A / B / C" into one price per line, dropping empty parts."""
    parts = (part.strip() for part in price_text.split('/'))
    return '\n'.join(part for part in parts if part)


# Walk every listing page, turning each listed event into a calendar Event.
listing_page = True
while listing_page:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    listings = soup.find('ul', attrs={'class': 'row listing__item__list small-up-1 medium-up-1'})
    if listings is None:
        # Nothing to iterate; stop paging but keep whatever was collected so far.
        print('ERROR: No event listing found on the current page -- stopping')
        break

    # Only direct child tags are listings; iterating the tag itself would also
    # yield whitespace text nodes, which have no .article or .prettify().
    for listing in listings.find_all(True, recursive=False):
        try:
            event = Event()
            created_at = datetime.now()
            event.add('UID', uuid.uuid4())      # Unique ID for the Event
            event.add('CREATED', created_at)    # When the Event was created
            event.add('DTSTAMP', created_at)    # When the Event was last modified

            item = listing.article
            if item is None:
                raise ValueError('listing has no <article> element -- skipping')

            subject_node = item.find('h2', attrs={'class': 'listing__item__title'})
            if subject_node and subject_node.text:
                subject = subject_node.text
            else:
                subject = 'Not Stated'
            event.add('SUMMARY', subject)

            event_url = item.div.a.get("href")

            event_date_node = item.find('p', attrs={'class': 'listing__item__date'})
            if not event_date_node:
                raise ValueError(f'No event date specified for [{subject}] -- skipping')
            # Listing dates look like "Friday 24th November 2023"; drop the
            # ordinal suffix so strptime can parse the day number.
            event_date = _ORDINAL_SUFFIX.sub('', event_date_node.text.strip())
            if '—' in event_date:
                date_text = event_date.split('—')
                start = date_text[0].strip()
                end = date_text[1].strip()
                print(f'** There is a date range stated for [{subject}] from [{start}] to [{end}] **')
                captured_start_date = _parse_listing_date(start)
                captured_end_date = _parse_listing_date(end)
            else:
                print(f'** There is single date stated for [{subject}] at [{event_date}] **')
                captured_start_date = captured_end_date = _parse_listing_date(event_date)

            venue_node = item.find('p', attrs={'class': 'listing__item__venue'})
            event_location = venue_node.text.strip() if venue_node else 'Not Stated'
            event.add('LOCATION', event_location)

            # Get the detailed event page for the event time and price.
            driver.get(event_url)
            event_soup = BeautifulSoup(driver.page_source, 'html.parser')
            price = event_soup.find('h2', attrs={'class': 'event__meta__tickets'})
            event_time_node = event_soup.find('a', attrs={'data-tooltip-content': '#tooltip_content__1'})

            start_time = None
            if event_time_node:
                event_time = event_time_node.text.strip()
                print(f'** There is a time stated for [{subject}] on [{event_date}] at [{event_time}] **')
                start_time = _parse_start_time(event_time)
                if start_time is None:
                    print(f'** Cannot parse a time from [{event_time}], so just using the date **')
            else:
                print(f'** There is no time stated for [{subject}], so just using the date **')

            if start_time is not None:
                full_datetime = datetime.combine(captured_start_date, start_time)
                event.add('DTSTART', full_datetime)
                # Assume the event is 2 hours long
                event.add('DTEND', full_datetime + timedelta(hours=2))
            else:
                # All-day event. For DATE values DTEND is exclusive and must be
                # later than DTSTART, so end on the day after the last stated date.
                last_day = max(captured_start_date, captured_end_date)
                event.add('DTSTART', captured_start_date)
                event.add('DTEND', last_day + timedelta(days=1))

            # A missing price or summary should not discard the whole event.
            price_text = _format_prices(price.text) if price else ''
            if not price_text:
                price_text = 'Not Stated'
            summary_node = item.find('div', attrs={'class': 'listing__item__summary'})
            if summary_node and summary_node.p:
                description = summary_node.p.text
            else:
                description = ''

            full_description = f'Details: {event_url}\n\n{description}\n\nPrice: {price_text}'
            event.add('DESCRIPTION', full_description)

            # Add the event to the calendar
            cal.add_component(event)
        except Exception as e:
            # Best effort: report the failing listing and carry on with the next.
            # format_exc() returns the traceback text; print_exc() returns None.
            print(f'**Problem parsing listing [{listing.prettify()}]: {e}: [{traceback.format_exc()}]')

    # Find more pages. The link comes from the listing page parsed above, even
    # though the browser has since visited the individual event pages.
    listing_page = soup.find('a', attrs={'class': 'pagination__item ias-next btn btn--primary btn--small btn--next'})
    next_url = listing_page.get('href') if listing_page else None
    if not next_url:
        break
    driver.get(next_url)
# Close the browser session, then stop the ChromeDriver process that was
# started manually so it is not left running in the background.
try:
    driver.quit()
finally:
    service.stop()

# Report the number of VEVENT components; cal.items() would count the
# calendar-level properties and cal.name is only the component type.
event_count = len(cal.walk('VEVENT'))
print(f"Calendar [{cal.get('X-WR-CALNAME')}] has [{event_count}] entries")

# Write the serialized calendar; the context manager closes the file even if
# serialization or the write fails.
with open('cornExchange.ics', 'wb') as ics_file:
    ics_file.write(cal.to_ical())