get-results.py
import pandas as pd
import requests
import bs4
from io import StringIO
import time
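
# Source pages: the FBref Premier League overview (standings) and the passing
# stats page, plus the site root for resolving relative hrefs.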
table_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'
url = 'https://fbref.com/en/comps/9/passing/Premier-League-Stats'
base_url = 'https://fbref.com'
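
# Fetch the league page and save only its first table, the league standings.
# Note: FBref sometimes rejects the default requests User-Agent; if the request
# comes back 403, sending browser-like headers may help (an assumption, not
# something this script handles).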
html = requests.get(table_url)
bs_obj = bs4.BeautifulSoup(html.text, "lxml")
for table in bs_obj.find_all("table"):
    # extract_links="all" makes every cell a (text, href) tuple; keep the
    # absolute URL when a link exists, the plain text otherwise
    df_table = pd.read_html(StringIO(str(table)), extract_links="all")[0].apply(
        lambda col: [base_url + v[1] if v[1] else v[0] for v in col]
    )
    # header cells are (text, href) tuples too; keep the text
    df_table.columns = [tt[0] or tt[1] for tt in df_table.columns]
    df_table.to_csv("table.csv")
    break  # only the first table (the standings) is needed
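
# FBref embeds most stats tables inside HTML comments, so a plain
# find_all("table") on the page misses them; re-parse each comment and read any
# tables found inside. Every table is dumped to its own CSV, and df ends up
# holding whichever table was parsed last; that survivor is what gets merged below.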
html = requests.get(url)
bs_obj = bs4.BeautifulSoup(html.text, "lxml")
i = 0
for comment in bs_obj.find_all(string=lambda string: isinstance(string, bs4.Comment)):
    c_soup = bs4.BeautifulSoup(comment, "lxml")
    for table in c_soup.find_all("table"):
        df = pd.read_html(StringIO(str(table)), extract_links="all")[0].apply(
            lambda col: [base_url + v[1] if v[1] else v[0] for v in col]
        )
        i += 1
        # two-row headers: prefer the stat name on the lower level, then fall
        # back through its link and the upper-level label
        df.columns = [tt[1][0] or tt[1][1] or tt[0][0] or tt[0][1] for tt in df.columns]
        df = df[df.Rk != 'Rk']  # drop header rows repeated mid-table
        df.to_csv("df" + str(i) + ".csv")
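
# Join the standings onto the last comment table by squad; after the
# extract_links pass both sides carry the absolute squad URL, so the keys line up.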
df = df_table.merge(df, how='left', on='Squad')
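
# Follow every match-log link in the Matches column and collect each log,
# tagged with the URL it came from.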
list_of_dfs = list()
for match_log_url in df.Matches:
    time.sleep(5)  # throttle requests so FBref doesn't rate-limit the scrape
    print("Getting " + match_log_url)
    html = requests.get(match_log_url)
    bs_obj = bs4.BeautifulSoup(html.text, "lxml")
    for table in bs_obj.find_all("table"):
        df_temp = pd.read_html(StringIO(str(table)), extract_links="all")[0].apply(
            lambda col: [base_url + v[1] if v[1] else v[0] for v in col]
        )
        df_temp.columns = [tt[1][0] or tt[1][1] or tt[0][0] or tt[0][1] for tt in df_temp.columns]
        df_temp = df_temp[df_temp.Day != ""]  # drop spacer rows with no match date
        df_temp["source"] = match_log_url  # remember which log each row came from
        print(df_temp.shape)
        list_of_dfs.append(df_temp)
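
# Stack all the match logs and join them back onto the merged squad table via
# the source URL, then persist both.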
main_df = pd.concat(list_of_dfs)
main_df.to_csv("main_df.csv")
j_df = df.merge(main_df, how='left', left_on='Matches', right_on='source')
j_df.to_csv("joined_df.csv")