-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapeMars.py
239 lines (166 loc) · 9.29 KB
/
scrapeMars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# --- dependencies and setup ---
from bs4 import BeautifulSoup
import pandas as pd
from splinter import Browser
import time
def init_browser(driver_path=r"C:\Webdrivers\chromedriver", *, headless=True):
    """Launch and return a splinter-controlled Chrome browser.

    Args:
        driver_path: Location of the chromedriver executable. The default is
            machine-specific; pass your own path (for example
            ``"driver/chromedriver.exe"``) when it differs.
        headless: When true, Chrome runs without a visible window.

    Returns:
        The ``Browser`` instance created by splinter.
    """
    # NOTE: the default path is a raw string so the Windows backslashes stay
    # literal instead of being parsed as (invalid) escape sequences.
    return Browser("chrome", executable_path=driver_path, headless=headless)
def scrape():
    """Scrape Mars data from several websites and return it as one dict.

    Opens a browser via ``init_browser()``, visits each source in turn and
    always quits the browser afterwards, even if one of the steps raises.

    Returns:
        dict with the keys ``news_title``, ``news_para``,
        ``featuredimage_title``, ``featuredimage_url``, ``latest_tweet``,
        ``mars_fact_table`` (an HTML table string) and ``hemisphere_images``
        (a list of ``{"title": ..., "img_url": ...}`` dicts).
    """
    browser = init_browser()
    try:
        news_title, news_para = _scrape_mars_news(browser)
        featuredimage_title, featuredimage_url = _scrape_featured_image(browser)
        latest_tweet = _scrape_mars_weather(browser)
        html_table = _scrape_mars_facts(browser)
        hemisphere_image_data = _scrape_hemisphere_images(browser)
    finally:
        # Quit in ``finally`` so a failed step does not leave the browser
        # process running.
        browser.quit()

    # --- Store all values in dictionary and return results ---
    return {
        "news_title": news_title,
        "news_para": news_para,
        "featuredimage_title": featuredimage_title,
        "featuredimage_url": featuredimage_url,
        "latest_tweet": latest_tweet,
        "mars_fact_table": html_table,
        "hemisphere_images": hemisphere_image_data,
    }


def _visit_and_parse(browser, url, wait=1):
    """Visit *url*, pause *wait* seconds, and return the page as a soup."""
    browser.visit(url)
    # Fixed pause before reading browser.html, giving the page time to load.
    time.sleep(wait)
    return BeautifulSoup(browser.html, 'html.parser')


def _scrape_mars_news(browser):
    """Return ``(title, teaser)`` of the first headline on the Mars News page."""
    print("Scraping Mars News...")
    soup = _visit_and_parse(browser, 'https://mars.nasa.gov/news/')
    # The first <li class="slide"> holds the latest headline and its teaser.
    first_li = soup.find('li', class_='slide')
    news_title = first_li.find('div', class_='content_title').text
    news_para = first_li.find('div', class_='article_teaser_body').text
    print("Mars News: Scraping Complete!")
    return news_title, news_para


def _scrape_featured_image(browser):
    """Return ``(title, url)`` of the JPL featured space image."""
    print("Scraping JPL Featured Space Image...")
    soup = _visit_and_parse(
        browser, 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    )
    # The first <a> inside the carousel container carries the image details.
    carousel_link = soup.find('div', class_='carousel_container').find('a')
    featuredimage_title = carousel_link['data-title']

    # --- click the 'full image' button to reach the full-size image ---
    browser.find_by_id('full_image').click()
    time.sleep(1)

    if not browser.is_element_visible_by_css('div.fancybox-title'):
        # No 'more info' overlay: fall back to the link from the carousel.
        featuredimage_url = (
            'https://www.jpl.nasa.gov/' + carousel_link['data-fancybox-href']
        )
    else:
        # Follow 'more info' to the image detail page.
        browser.links.find_by_partial_text('more info').click()
        time.sleep(1)
        detail_soup = BeautifulSoup(browser.html, 'html.parser')
        # The second 'download_tiff' div is the one used for the full-size jpg.
        download_div = detail_soup.find_all('div', class_='download_tiff')[1]
        # A bare scheme is prepended, so the href is presumably
        # protocol-relative ("//host/...") -- TODO confirm.
        featuredimage_url = 'https:' + download_div.find('a')['href']

    print("JPL Featured Space Image: Scraping Complete!")
    return featuredimage_title, featuredimage_url


def _scrape_mars_weather(browser):
    """Return the text of the latest Mars Weather tweet, newlines removed."""
    print("Scraping Mars Weather's Twitter Account...")
    # This page gets a longer pause (5 s) than the other sources.
    soup = _visit_and_parse(browser, 'https://twitter.com/marswxreport', wait=5)
    # The first <div lang="en"> is taken as the latest tweet.
    tweet = soup.find_all('div', lang='en')[0].text
    latest_tweet = tweet.replace('\n', '')
    print("Mars Weather: Scraping Complete!")
    return latest_tweet


def _scrape_mars_facts(browser):
    """Return the Mars facts table rendered as a styled HTML string."""
    # Local import keeps this block self-contained.
    from io import StringIO

    print("Scraping Mars Facts...")
    browser.visit('https://space-facts.com/mars/')
    time.sleep(1)
    # Wrap the markup in StringIO: recent pandas versions deprecate passing
    # literal HTML strings directly to read_html.
    tables = pd.read_html(StringIO(browser.html))
    # --- the first table on the page is the facts table ---
    facts_df = tables[0]
    facts_df.columns = ['Description', 'Value']
    html_table = facts_df.to_html(
        index=False,
        header=False,
        border=0,
        classes="table table-sm table-striped font-weight-light",
    )
    print("Mars Facts: Scraping Complete!")
    return html_table


def _scrape_hemisphere_images(browser):
    """Return a list of ``{"title", "img_url"}`` dicts, one per hemisphere."""
    print("Scraping Mars Hemisphere Images...")
    soup = _visit_and_parse(
        browser,
        'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars',
    )
    # Each <div class="item"> on the results page is one hemisphere.
    hemisphere_count = len(soup.find_all('div', class_="item"))
    base_url = 'https://astrogeology.usgs.gov'

    hemisphere_image_data = []
    for index in range(hemisphere_count):
        # Look the links up again on every pass: browser.back() below returns
        # to the results page, so the link is fetched fresh by position.
        browser.find_by_css("a.product-item h3")[index].click()
        time.sleep(1)
        detail_soup = BeautifulSoup(browser.html, 'html.parser')
        # --- full-res image url = base url + the wide image's src ---
        img_url = base_url + detail_soup.find('img', class_="wide-image")['src']
        img_title = browser.find_by_css('.title').text
        hemisphere_image_data.append({"title": img_title, "img_url": img_url})
        # --- go back to the main page ---
        browser.back()

    print("Mars Hemisphere Images: Scraping Complete!")
    return hemisphere_image_data