# -*- coding: utf-8 -*-
"""spaceX_WebScrap_week1_2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1oCDR1NYtHnwYhR-8PQ86qK_gwpThuDUX

# Web scraping (also known as web data extraction)

Web scraping is the process of automatically collecting data from websites. This is done using programs called web scrapers or web robots. These programs download web pages, parse their HTML structure, and extract the desired data.

In this lab, you will perform web scraping to collect Falcon 9 historical launch records from the Wikipedia page titled "List of Falcon 9 and Falcon Heavy launches":

https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches

# Objectives

Web scrape Falcon 9 launch records with BeautifulSoup:
- Extract a Falcon 9 launch records HTML table from Wikipedia
- Parse the table and convert it into a Pandas data frame

First, let's import the required packages for this lab.
"""
#Web scraping can be a powerful tool for collecting data from the web. However, it is important to use it responsibly and ethically. You should always respect the robots.txt file of a website, and you should never scrape data that is not publicly available.
#Beautiful Soup 4 is a library for parsing HTML and XML documents. It is used to extract data from HTML pages by navigating through the HTML tree and finding the elements that contain the data you want. Beautiful Soup 4 is a very powerful tool for parsing HTML, and it can handle even the most complex and messy HTML documents.
!pip install beautifulsoup4
#Requests is a library for making HTTP requests. It is used to send requests to websites and get the HTML content of the pages. Requests is a very simple and easy-to-use library, and it is a great choice for making simple HTTP requests.
!pip install requests
"""We're importing six essential libraries."""
import sys #This module provides information about the Python runtime environment and functions for exiting the program and interacting with the command line.
import requests #This library is used to send HTTP requests to websites and receive responses.
from bs4 import BeautifulSoup #This library is used to parse HTML and XML documents.
import re #This module provides functions for pattern matching in strings.
import unicodedata #This module provides functions for working with Unicode characters.
import pandas as pd #This library is used for working with tabular data and data analysis.
import warnings #Identify potential issues before they become errors #Debug errors and bugs #Document code with additional information
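
# Since `warnings` is imported, one common (optional) pattern in notebooks is to
# silence noisy third-party warnings. This is an illustrative sketch, not a lab
# step; the category chosen here is just an example.
warnings.filterwarnings("ignore", category=FutureWarning)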
"""Additionally, we'll offer helper functions to assist you in processing web-scraped HTML tables."""
def date_time(table_cells):  # Purpose: extracts the date and time from a table cell element.
    """
    This function returns the date and time from the HTML table cell
    Input: a table data cell element from a launch-record row
    """
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]
def booster_version(table_cells):  # Purpose: extracts the booster version from a table cell element.
    """
    This function returns the booster version from the HTML table cell
    Input: a table data cell element from a launch-record row
    """
    # Keep only the strings at even indices (0, 2, 4, ...) using the modulo
    # operator (%), then drop the trailing element, which is a reference marker.
    out = ''.join([booster_version for i, booster_version in enumerate(table_cells.strings) if i % 2 == 0][0:-1])
    return out
def landing_status(table_cells):  # Purpose: extracts the landing status from a table cell element.
    """
    This function returns the landing status from the HTML table cell
    Input: a table data cell element from a launch-record row
    """
    out = [i for i in table_cells.strings][0]
    return out
def get_mass(table_cells):  # Purpose: extracts the mass (including the unit) from a table cell element.
    # NFKD normalization turns non-breaking spaces (e.g. in "525 kg") into plain spaces.
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        new_mass = mass[0:mass.find("kg") + 2]  # keep everything up to and including "kg"
    else:
        new_mass = 0
    return new_mass
def extract_column_from_header(row):  # Purpose: cleans and extracts the column name from a table header cell.
    """
    This function returns the column name from the HTML table header cell
    Input: a table header cell element (<th>)
    """
    if row.br:   # Remove any <br> (line break) tag from the cell using .extract().
        row.br.extract()
    if row.a:    # Remove any anchor tag (<a>) from the cell using .extract().
        row.a.extract()
    if row.sup:  # Remove any superscript tag (<sup>) from the cell using .extract().
        row.sup.extract()
    column_name = ' '.join(row.contents)
    # Filter out digit-only and empty names: header cells holding only a number
    # are flight-number cells, not column names, so the function implicitly
    # returns None for them.
    if not (column_name.strip().isdigit()):
        column_name = column_name.strip()
        return column_name
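# A quick, self-contained sanity check of the helpers above, using synthetic cells
# shaped like the Wiki table cells (invented example data, not from the real page):
_demo_cells = BeautifulSoup(
    '<td>4 June 2010,<br/>18:45</td>'
    '<td>F9 v1.0<br/>B0003.1<sup>[8]</sup></td>'
    '<td>525\u00a0kg</td>',
    'html.parser').find_all('td')
print(date_time(_demo_cells[0]))        # expected: ['4 June 2010,', '18:45']
print(booster_version(_demo_cells[1]))  # expected: 'F9 v1.0'
print(get_mass(_demo_cells[2]))         # expected: '525 kg'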
"""To uphold consistency in lab tasks, you are required to retrieve data from a time-stamped version of the List of Falcon 9 and Falcon Heavy launches Wiki Page, dated June 9, 2021."""
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"
"""Subsequently, fetch the HTML content from the specified URL and store the response in an object
#TASK 1: Request the Falcon9 Launch Wiki page from its URL
To begin, we'll initiate an HTTP GET request to retrieve the Falcon9 Launch HTML page and obtain an HTTP response
"""
# Use the requests.get() method with the provided static_url
# and assign the response to an object.
page = requests.get(static_url)
print(page.status_code)  # 200 indicates a successful request
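
# A slightly more defensive variant (optional sketch, not required by the lab):
# pass a timeout so the call cannot hang forever, and call raise_for_status(),
# which raises requests.HTTPError for 4xx/5xx responses instead of failing silently.
response = requests.get(static_url, timeout=30)
response.raise_for_status()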
"""Create a BeautifulSoup object from the HTML response"""
# Create a BeautifulSoup object from the HTML response
# Use BeautifulSoup() to create a BeautifulSoup object from a response text content
soup = BeautifulSoup(page.text, 'html.parser')
#Use soup.title attribute
soup.title
"""#TASK 2 : Extract all column/variable names from the HTML table header
Next, we want to collect all relevant column names from the HTML table header
Let's try to find all tables on the wiki page first. If you need to refresh your memory about BeautifulSoup, please check the external reference link towards the end of this lab
"""
# Use the find_all() function on the BeautifulSoup object with element type `table`
# and assign the result to a list called `html_tables`.
# find_all('table') returns every <table> tag in the parsed HTML content.
html_tables = soup.find_all('table')
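
# An alternative sketch to indexing by position: select the launch-record tables
# directly by their CSS classes, using the same selector the parsing loop further
# below uses ("wikitable plainrowheaders collapsible" on this snapshot).
launch_tables = soup.find_all('table', "wikitable plainrowheaders collapsible")
print(len(html_tables), len(launch_tables))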
"""Starting from the third table is our target table contains the actual launch records."""
# Let's print the third table and check its content
#first_launch_table: This is a variable name chosen to describe the table being assigned.
# =: This is the assignment operator, used to store the value on the right side in the variable on the left.
#html_tables: This refers back to the variable you previously used to store the list of all tables found using soup.find_all('table').
#[2]: This is indexing notation used to access specific elements within a list. In Python, indexing starts from 0, so [2] refers to the third element (index 2) in the html_tables list.
first_launch_table = html_tables[2]
print(first_launch_table)
"""You should able to see the columns names embedded in the table header elements as follows:
Next, we just need to iterate through the elements and apply the provided extract_column_from_header() to extract column name one by one
"""
column_names = []
# Apply find_all() function with `th` element on first_launch_table
# Iterate each th element and apply the provided extract_column_from_header() to get a column name
# Append the Non-empty column name (`if name is not None and len(name) > 0`) into a list called column_names
#In summary:
#These comments explain the steps involved in extracting column names from the header row of a specific table (first_launch_table) on a webpage. It utilizes BeautifulSoup's find_all() function and a custom function (extract_column_from_header()) to achieve this.
#The for i in first_launch_table.find_all('th') line initiates a for loop.
#This loop iterates over all the th elements (table header cells) found within the first_launch_table object.
#The i variable in each iteration represents a single th element (header cell).
#if extract_column_from_header(i) != None:
#This conditional statement checks if the extract_column_from_header function has successfully extracted a column name from the current th element (header cell).
#If the extract_column_from_header function returns None, it means it failed to extract a column name, and the loop moves on to the next header cell.\
#if extract_column_from_header(i) != None:
#This conditional statement checks if the extract_column_from_header function has successfully extracted a column name from the current th element (header cell).
#If the extract_column_from_header function returns None, it means it failed to extract a column name, and the loop moves on to the next header cell.
#column_names.append(extract_column_from_header(i))
#If both the above conditions are met (i.e., a valid, non-empty column name is extracted), the extracted column name is appended to a list called column_names.
#The column_names list will ultimately contain all the valid and non-empty column names extracted from the table's header row.
for i in first_launch_table.find_all('th'):
if extract_column_from_header(i) != None:
if len (extract_column_from_header(i))>0:
column_names.append(extract_column_from_header(i))
#This code systematically goes through the header cells of the table, extracts column names from their content (assuming the extract_column_from_header function handles that), and stores only valid, non-empty column names in a list named column_names.
print(column_names)
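
# The same extraction as a single comprehension, shown only for comparison
# (a sketch; requires Python 3.8+ for the `:=` operator):
column_names_alt = [name for th in first_launch_table.find_all('th')
                    if (name := extract_column_from_header(th))]
print(column_names_alt == column_names)  # expected: True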
"""#TASK 3: Create a data frame by parsing the launch HTML tables
We will create an empty dictionary with keys from the extracted column names in the previous task. Later, this dictionary will be converted into a Pandas dataframe
"""
# This code block constructs the skeleton of the DataFrame from the launch HTML tables:
# dict.fromkeys(column_names) creates a dictionary whose keys are the column names
# found in the table header, with every value initialized to None.
launch_dict = dict.fromkeys(column_names)
# Remove an irrelevant column: the combined "Date and time ( )" header is dropped
# because we will store Date and Time as separate columns instead.
del launch_dict['Date and time ( )']
# Let's initialize launch_dict with an empty list for each column of interest;
# these lists will eventually hold the corresponding data points for each column.
# A few new keys (not in the original header) are also added further below.
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []
# Added some new columns
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]
#Overall, this code snippet lays the groundwork for populating a DataFrame with data extracted from the launch HTML tables. It sets up the structure with column names and empty lists to hold the corresponding data points.
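
# A more compact way to reach the same end state (sketch, for comparison only):
# a dict comprehension mapping every current key to a fresh empty list.
launch_dict_alt = {name: [] for name in launch_dict}
print(launch_dict_alt.keys() == launch_dict.keys())  # expected: True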
"""Next, we just need to fill up the launch_dict with launch records extracted from table rows.
Usually, HTML tables in Wiki pages are likely to contain unexpected annotations and other types of noises, such as reference links B0004.1[8], missing values N/A [e], inconsistent formatting, etc. To simplify the parsing process, we have provided an incomplete code snippet below to help you to fill up the launch_dict. Please complete the following code snippet with TODOs or you can choose to write your own logic to parse all launch tables:
"""
# Notes on the parsing loop below:
# - Error handling: when booster_version() returns an empty string, the code falls
#   back to the raw cell text via row[1].text.strip().
# - Cell text is extracted and stripped consistently to handle extra whitespace.
# - The loop assumes each launch row has at least 9 <td> cells, indexed 0 through 8.
extracted_row = 0  # Counter that tracks the number of extracted launch records.
# Extract each table:
# loop through all tables whose classes match "wikitable plainrowheaders collapsible"
# in the HTML content parsed by BeautifulSoup; enumerate assigns an index
# (table_number) to each table.
for table_number, table in enumerate(soup.find_all('table', "wikitable plainrowheaders collapsible")):
    # get each table row
    for rows in table.find_all("tr"):
        # check to see if the first table heading is a number corresponding to a launch number
        if rows.th:
            if rows.th.string:
                flight_number = rows.th.string.strip()
                flag = flight_number.isdigit()
            else:
                flag = False
        else:
            flag = False
        # get the table data cells of this row
        row = rows.find_all('td')
        # if it is a number, save the cells in the dictionary
        if flag:
            extracted_row += 1
            # Flight Number value
            # TODO: Append the flight_number into launch_dict with key `Flight No.`
            launch_dict['Flight No.'].append(flight_number)

            datatimelist = date_time(row[0])
            # Date value
            # TODO: Append the date into launch_dict with key `Date`
            date = datatimelist[0].strip(',')
            launch_dict['Date'].append(date)

            # Time value
            # TODO: Append the time into launch_dict with key `Time`
            time = datatimelist[1]
            launch_dict['Time'].append(time)

            # Booster version
            # TODO: Append the bv into launch_dict with key `Version Booster`
            bv = booster_version(row[1])
            if not bv:
                bv = row[1].text.strip()  # fall back to the raw cell text
            launch_dict['Version Booster'].append(bv)

            # Launch Site
            # TODO: Append the launch_site into launch_dict with key `Launch site`
            launch_site = row[2].a.string if row[2].a else row[2].text.strip()
            launch_dict['Launch site'].append(launch_site)

            # Payload
            # TODO: Append the payload into launch_dict with key `Payload`
            payload = row[3].a.string if row[3].a else row[3].text.strip()
            launch_dict['Payload'].append(payload)

            # Payload Mass
            # TODO: Append the payload_mass into launch_dict with key `Payload mass`
            payload_mass = get_mass(row[4])
            launch_dict['Payload mass'].append(payload_mass)

            # Orbit
            # TODO: Append the orbit into launch_dict with key `Orbit`
            orbit = row[5].a.string if row[5].a else row[5].text.strip()
            launch_dict['Orbit'].append(orbit)

            # Customer
            # TODO: Append the customer into launch_dict with key `Customer`
            customer = row[6].a.string if row[6].a else row[6].text.strip()
            launch_dict['Customer'].append(customer)

            # Launch outcome
            # TODO: Append the launch_outcome into launch_dict with key `Launch outcome`
            launch_outcome = list(row[7].strings)[0]
            launch_dict['Launch outcome'].append(launch_outcome)

            # Booster landing
            # TODO: Append the booster_landing into launch_dict with key `Booster landing`
            booster_landing = landing_status(row[8])
            launch_dict['Booster landing'].append(booster_landing)
headings = []
for key, values in dict(launch_dict).items():
    if key not in headings:
        headings.append(key)
    # Keys that came from the header but were never filled still map to None;
    # drop them so every remaining key maps to a list.
    if values is None:
        del launch_dict[key]
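
# Sanity check (optional sketch): after the cleanup above, every remaining value
# is a list, and ideally all lists have the same length. Printing the lengths
# makes any parsing gaps visible before the padding step below.
for key, value in launch_dict.items():
    print(key, len(value))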
# The pad_dict_list function standardizes the length of the lists in a dictionary.
# It first scans all lists to find the maximum length (lmax), then pads every
# shorter list with the fill value `padel` until it reaches lmax, and finally
# returns the modified dictionary. Uniform list lengths keep the data consistent,
# make it compatible with tools that expect rectangular input (such as
# pd.DataFrame), and simplify downstream processing, analysis, and visualization.
def pad_dict_list(dict_list, padel):
    lmax = 0
    # First pass: find the length of the longest list.
    for lname in dict_list.keys():
        lmax = max(lmax, len(dict_list[lname]))
    # Second pass: pad every shorter list with `padel` up to that length.
    for lname in dict_list.keys():
        l1 = len(dict_list[lname])
        if l1 < lmax:
            dict_list[lname] += [padel] * (lmax - l1)
    return dict_list
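
# An alternative to padding (sketch, shown for comparison only): pandas aligns
# unequal-length columns by itself if each list is wrapped in a pd.Series first,
# filling the missing tail entries with NaN instead of 0.
df_alt = pd.DataFrame({key: pd.Series(value) for key, value in launch_dict.items()})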
# Call pad_dict_list on launch_dict with the fill value 0: any list shorter than
# the longest one is padded with zeros so all lists end up the same length.
# The function mutates launch_dict in place (and also returns it), so printing
# the dictionary afterwards shows the padded lists.
pad_dict_list(launch_dict, 0)
print(launch_dict)
"""After you have fill in the parsed launch record values into launch_dict, you can create a dataframe from it."""
#In essence, this code takes a dictionary named launch_dict and converts it into a tabular format using a pandas DataFrame. The dictionary keys become column names in the DataFrame, and the corresponding values become the data in each column.
df = pd.DataFrame(launch_dict)
# df.loc[:, 'Flight No.'] accesses rows and columns by label: the colon selects all
# rows, and 'Flight No.' selects that column.
# list(range(1, df.shape[0] + 1)) builds the replacement values: df.shape[0] is the
# number of rows, so the range runs from 1 up to and including the row count.
df.loc[:, 'Flight No.'] = list(range(1, df.shape[0] + 1))
# In simpler terms: this replaces the values in the "Flight No." column with a fresh
# sequence of flight numbers, starting from 1 and incrementing by 1 up to the total
# number of rows in the DataFrame.
print(df)
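
# Optional cleanup sketch (assumption: 'Payload mass' holds strings like "525 kg",
# or the placeholder 0): strip everything except digits and dots, then convert to
# a numeric dtype. Computed into a standalone Series here so the CSV exported
# below stays unchanged.
payload_mass_kg = pd.to_numeric(
    df['Payload mass'].astype(str).str.replace(r'[^\d.]', '', regex=True),
    errors='coerce')
print(payload_mass_kg.describe())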
"""We can now export it to a CSV for the next section, but to make the answers consistent and in case you have difficulties finishing this lab.
Following labs will be using a provided dataset to make each lab independent.
"""
#df.to_csv(...): This calls the to_csv method on the DataFrame df. This method is used to export the DataFrame data to a CSV file.
#'spacex_web_scraped.csv': This is the name of the CSV file that will be created (including the path if specified).
#index=False: This is an optional argument that specifies how to handle the DataFrame index (the row labels). By setting it to False, we instruct the method to not include the index in the exported CSV file. This keeps the CSV file cleaner and more consistent with the format expected in following labs.
df.to_csv('spacex_web_scraped.csv', index=False)
#In simpler terms: This code saves the data from the DataFrame df into a CSV file named "spacex_web_scraped.csv". It excludes the row labels (index) from the exported data.
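
# Quick verification sketch: read the file back and confirm the round trip
# preserved the shape of the dataframe.
check = pd.read_csv('spacex_web_scraped.csv')
print(check.shape, df.shape)  # the two shapes should match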