'''
Script to scrape stock screener data from finviz.com and store it in SQLite.
'''
from bs4 import BeautifulSoup
import re
import sqlite3
import urllib.request
from pprint import pprint
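
# open (or create) the SQLite database that will hold the scraped rows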
conn = sqlite3.connect("python_meetup.sqlite")
cursor = conn.cursor()
table_name = "finviz_data"
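# drop and recreate the table so repeated runs don't accumulate duplicate rows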
cursor.execute("DROP TABLE IF EXISTS " + table_name)
cursor.execute("""
CREATE TABLE {} (
no integer,
ticker varchar(10),
company varchar(255),
sector varchar(255),
industry varchar(255),
country varchar(255),
market_cap float(4),
pe float(4),
price float(4),
change float(4),
volume integer
);
""".format(table_name))
conn.commit()


def get_table_headers():
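    '''
    Fetch the first screener page and return its column headers as a list.
    The CSS class names below match finviz's markup at the time of writing
    and may need updating if the site changes.
    '''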
    url = "http://finviz.com/screener.ashx?v=111&f=cap_nano,sec_healthcare&r=1"
    content = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(content, "html.parser")
    table_headers = []
    for th in soup.select(".table-top"):
        table_headers.append(th.get_text().replace(".", ""))
    # equivalently, as a list comprehension:
    # table_headers = [th.get_text().replace(".", "") for th in soup.select(".table-top")]
    # the Ticker header uses its own CSS class, so grab it separately and
    # splice it into position 1 of the header row
    ticker = soup.select(".table-top-s")
    table_headers.insert(1, ticker[0].get_text())
    pprint(table_headers)
    return table_headers


def get_rows_from_soup(soup, table_headers):
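    '''
    Regroup the screener's data cells into one dict per row, keyed by
    column header. The cells stream out of the page in row-major order,
    so a full set of headers marks the end of each row.
    '''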
    table_row_data = []
    count = 0
    row_data = {}
    for td in soup.select(".screener-body-table-nw"):
        row_data[table_headers[count]] = td.get_text()
        count += 1
        if count >= len(table_headers):
            count = 0
            table_row_data.append(row_data)
            row_data = {}
    return table_row_data


def get_data():
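    '''
    Walk the screener 20 rows per page, printing each row and inserting it
    into the finviz_data table. Pagination stops when the page no longer
    shows a "next" link.
    '''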
    headers = get_table_headers()
    initial_number = 1
    has_next_page = True
    while has_next_page:
        url = "http://www.finviz.com/screener.ashx?v=111&f=cap_nano,sec_healthcare&r={}".format(
            initial_number
        )
        content = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(content, "html.parser")
        data = get_rows_from_soup(soup, headers)
        for row in data:
            print(row['No'])
            print(row['Ticker'])
            print(row['Company'])
            print(row['Sector'])
            print(row['Industry'])
            print(row['Country'])
            # market cap carries a magnitude suffix (e.g. "5.50M");
            # [:-1] drops the suffix without converting the number
            print(row['Market Cap'][:-1])
            # finviz shows "-" when P/E is unavailable; use "" instead
            print("" if row['P/E'] == "-" else row['P/E'])
            print(row['Price'])
            print(row['Change'][:-1])  # strip the trailing "%"
            print(row['Volume'].replace(',', ""))  # drop thousands separators
            print("-" * 100)
            cursor.execute('INSERT INTO finviz_data VALUES(?,?,?,?,?,?,?,?,?,?,?)', (
                row['No'],
                row['Ticker'],
                row['Company'],
                row['Sector'],
                row['Industry'],
                row['Country'],
                row['Market Cap'][:-1],
                "" if row['P/E'] == "-" else row['P/E'],
                row['Price'],
                row['Change'][:-1],
                row['Volume'].replace(',', ""),
            ))
        conn.commit()
        # each page shows 20 rows; r is the 1-based offset of the first row
        initial_number += 20
        # stop once the page no longer contains a bold "next" link,
        # which is assumed to mark further pages
        if not re.findall(b"<b>next</b>", content):
            has_next_page = False


get_data()
conn.close()