-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb_scraping_5.py
123 lines (83 loc) · 2.98 KB
/
web_scraping_5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from bs4 import BeautifulSoup
import requests
import re
# Download the CoinMarketCap home page and parse it with Beautiful Soup's
# built-in "html.parser" backend; `doc` is the parsed document tree that the
# rest of the script navigates.
url = "https://coinmarketcap.com/"
# A timeout keeps the script from hanging forever on an unresponsive server
# (requests waits indefinitely when no timeout is given).
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
result = response.text
doc = BeautifulSoup(result, "html.parser")
# --------------------------------------------------
# --------------------------------------------------
# Example Tree Structure:
# <table>
#     <thead>
#         <tr>
#             <th> </th>   # Siblings 1
#             <th> </th>   # Siblings 1
#             <th> </th>   # Siblings 1
#         </tr>
#     </thead>
#     <tbody>
#         <tr> </tr>       # Siblings 2
#         <tr> </tr>       # Siblings 2
#         <tr> </tr>       # Siblings 2
#         <tr> </tr>       # Siblings 2
#     </tbody>
# </table>
# Here:
# -> Parent of each body <tr> is <tbody>
# -> Parent of each <th> is the header <tr>
# -> Parent of the header <tr> is <thead>
# -> Parent of both <thead> and <tbody> is <table>
# -> <thead> and <tbody> are siblings
# -> Siblings means they are in the same hierarchical level
# --------------------------------------------------
# --------------------------------------------------
# 1 - Tree Siblings
# Getting all table rows that are inside the table body
# table_body = doc.tbody
# table_rows = table_body.contents
# 1.1 - Prints all table rows
# print(table_rows)
# 1.2 - Prints the first table row (BTC)
# print(table_rows[0])
# 1.3 - Prints the next table row (ETH)
# print(table_rows[0].next_sibling)
# 1.4 - Prints the previous table row (BTC)
# print(table_rows[1].previous_sibling)
# 1.5 - All the rows that come after the first table row
# print(list(table_rows[0].next_siblings))
# 2 - Parents and Descendants
# 2.1 - Give the entire parent (Entire tbody tag)
# print(table_rows[0].parent)
# 2.2 - Give the parent name (tbody)
# print(table_rows[0].parent.name)
# 2.3 - Prints all the children (contents) inside the tag
# Contents / Children
# print(list(table_rows[1].children))
# print(list(table_rows[0].contents))
# 3 - Getting Crypto Prices for first 10
# `doc.tbody` is the first <tbody> tag in the parsed document, or None when
# the page contains no such tag.
table_body = doc.tbody
if table_body is None:
    # Without this check the next line fails with an opaque
    # "'NoneType' object has no attribute 'contents'" error.
    raise RuntimeError("No <tbody> element found in the downloaded page")
# Direct children of <tbody>, i.e. the table rows in document order.
table_rows = table_body.contents
# Filled by the extraction loop: maps each coin name to its price string.
names_and_price = {}
# 3.1 - Iterate the content inside the <tr> tag
# for tr in table_rows:
# print(tr)
# print()
# 3.2 - Iterate the content inside the <td> tag
# for tr in table_rows:
# for td in tr.contents[2:4]: # Print only the cells at index 2 and 3 (getting name and price only from the table)
# print(td)
# print()
# 3.3 - Storing the name and price into 2 variables
# looking at only the first 10 table rows
# for tr in table_rows[:10]:
# name, price = tr.contents[2:4]
# print(name.p.string) # The name that we want is inside the <p> tag
# print()
# 3.4 - Final Code
# Collect the name and price of the first 10 table rows into names_and_price.
for tr in table_rows[:10]:
    # The slice [2:4] selects the row's children at index 2 and 3: the cell
    # holding the coin name and the cell holding its price.
    cells = tr.contents[2:4]
    if len(cells) < 2:
        # Row has fewer than four children, so it cannot be unpacked into
        # a name cell and a price cell; skip it.
        continue
    name, price = cells
    # The name text lives in the first <p> of the name cell and the price
    # text in the first <span> of the price cell.
    name_tag = name.p
    price_tag = price.span
    if name_tag is None or price_tag is None:
        # Row does not have the expected markup; skip it rather than crash
        # with an AttributeError on None.
        continue
    # Populate the dictionary: coin name -> price string.
    names_and_price[name_tag.string] = price_tag.string
print(names_and_price)