-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
87 lines (70 loc) · 2.96 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
from requests import Session
import re
import gender_guesser.detector as gender
import pandas as pd
from plot import process_and_save_plot
def fetch_articles_links(session: Session, page: int, headers: dict) -> set:
root = 'https://cerncourier.com'
url_rev = f'{root}/l/reviews'
url = f'{url_rev}/page/{page}'
req = requests.get(url, headers = headers)
soup = BeautifulSoup(req.text, "html.parser")
print(url)
all_links = {link['href'] for link in soup.find_all('a', href=True)}
specific_links = {link for link in all_links if link.startswith('https://cerncourier.com/a/')}
return specific_links
def fetch_article_info(link: str, headers: dict) -> dict:
req = requests.get(link, headers = headers)
soup = BeautifulSoup(req.text, "html.parser")
article_info: dict = {}
try:
article_info['title'] = soup.find("h1", class_= "single-header__heading").get_text(separator=" ").strip()
except:
article_info['title'] = "Not Found"
try:
article_info['date'] = soup.find("div", class_= "single-header__meta").get_text(separator=" ").strip()
except:
article_info['date'] = "Not Found"
author_div = soup.find("div", class_= "author-byline")
if author_div is not None:
article_info['author'] = author_div.get_text(separator=" ").strip()
#print(detector.get_gender(articles_info['author'].split()[0]))
else:
article_info['author'] = "Not Found"
print(article_info['author'])
return article_info
def determine_gender(name: str, detector: gender.Detector) -> str:
if name == "Not Found":
gender = "unknown"
else:
gender = detector.get_gender(name.split()[0])
return gender
if __name__== '__main__':
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
'Accept-Encoding': 'utf-8'
}
page: int
articles_urls = set()
articles_info: list = []
detector = gender.Detector()
with requests.Session() as session:
for page in range(1, 82):
specific_links = fetch_articles_links(session, page, headers)
articles_urls = articles_urls|specific_links
print(articles_urls)
for link in articles_urls:
articles_info.append(fetch_article_info(link, headers))
for article_info in articles_info:
article_info['gender'] = determine_gender(article_info['author'], detector)
print(article_info)
df = pd.DataFrame(articles_info)
df['date'] = pd.to_datetime(df['date'], format='%d %B %Y')
# Sort the DataFrame by the 'Date' column
df_sorted = df.sort_values(by='date')
df_sorted = df_sorted.drop(df.columns[[0, 1]], axis=1)
df_sorted.to_csv("reviewers_gender.csv")
process_and_save_plot(input_csv="reviewers_gender.csv")